linux/fs/read_write.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/read_write.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/slab.h> 
   8#include <linux/stat.h>
   9#include <linux/fcntl.h>
  10#include <linux/file.h>
  11#include <linux/uio.h>
  12#include <linux/aio.h>
  13#include <linux/fsnotify.h>
  14#include <linux/security.h>
  15#include <linux/export.h>
  16#include <linux/syscalls.h>
  17#include <linux/pagemap.h>
  18#include <linux/splice.h>
  19#include <linux/compat.h>
  20#include "internal.h"
  21
  22#include <asm/uaccess.h>
  23#include <asm/unistd.h>
  24
  25typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
  26typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
  27                unsigned long, loff_t);
  28typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
  29
  30const struct file_operations generic_ro_fops = {
  31        .llseek         = generic_file_llseek,
  32        .read           = new_sync_read,
  33        .read_iter      = generic_file_read_iter,
  34        .mmap           = generic_file_readonly_mmap,
  35        .splice_read    = generic_file_splice_read,
  36};
  37
  38EXPORT_SYMBOL(generic_ro_fops);
  39
  40static inline int unsigned_offsets(struct file *file)
  41{
  42        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  43}
  44
  45/**
  46 * vfs_setpos - update the file offset for lseek
  47 * @file:       file structure in question
  48 * @offset:     file offset to seek to
  49 * @maxsize:    maximum file size
  50 *
  51 * This is a low-level filesystem helper for updating the file offset to
  52 * the value specified by @offset if the given offset is valid and it is
  53 * not equal to the current file offset.
  54 *
  55 * Return the specified offset on success and -EINVAL on invalid offset.
  56 */
  57loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  58{
  59        if (offset < 0 && !unsigned_offsets(file))
  60                return -EINVAL;
  61        if (offset > maxsize)
  62                return -EINVAL;
  63
  64        if (offset != file->f_pos) {
  65                file->f_pos = offset;
  66                file->f_version = 0;
  67        }
  68        return offset;
  69}
  70EXPORT_SYMBOL(vfs_setpos);
  71
  72/**
  73 * generic_file_llseek_size - generic llseek implementation for regular files
  74 * @file:       file structure to seek on
  75 * @offset:     file offset to seek to
  76 * @whence:     type of seek
  77 * @size:       max size of this file in file system
  78 * @eof:        offset used for SEEK_END position
  79 *
  80 * This is a variant of generic_file_llseek that allows passing in a custom
  81 * maximum file size and a custom EOF position, for e.g. hashed directories
  82 *
  83 * Synchronization:
  84 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  85 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  86 * read/writes behave like SEEK_SET against seeks.
  87 */
  88loff_t
  89generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  90                loff_t maxsize, loff_t eof)
  91{
  92        switch (whence) {
  93        case SEEK_END:
  94                offset += eof;
  95                break;
  96        case SEEK_CUR:
  97                /*
  98                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  99                 * position-querying operation.  Avoid rewriting the "same"
 100                 * f_pos value back to the file because a concurrent read(),
 101                 * write() or lseek() might have altered it
 102                 */
 103                if (offset == 0)
 104                        return file->f_pos;
 105                /*
 106                 * f_lock protects against read/modify/write race with other
 107                 * SEEK_CURs. Note that parallel writes and reads behave
 108                 * like SEEK_SET.
 109                 */
 110                spin_lock(&file->f_lock);
 111                offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 112                spin_unlock(&file->f_lock);
 113                return offset;
 114        case SEEK_DATA:
 115                /*
 116                 * In the generic case the entire file is data, so as long as
 117                 * offset isn't at the end of the file then the offset is data.
 118                 */
 119                if (offset >= eof)
 120                        return -ENXIO;
 121                break;
 122        case SEEK_HOLE:
 123                /*
 124                 * There is a virtual hole at the end of the file, so as long as
 125                 * offset isn't i_size or larger, return i_size.
 126                 */
 127                if (offset >= eof)
 128                        return -ENXIO;
 129                offset = eof;
 130                break;
 131        }
 132
 133        return vfs_setpos(file, offset, maxsize);
 134}
 135EXPORT_SYMBOL(generic_file_llseek_size);
 136
 137/**
 138 * generic_file_llseek - generic llseek implementation for regular files
 139 * @file:       file structure to seek on
 140 * @offset:     file offset to seek to
 141 * @whence:     type of seek
 142 *
 143 * This is a generic implemenation of ->llseek useable for all normal local
 144 * filesystems.  It just updates the file offset to the value specified by
 145 * @offset and @whence.
 146 */
 147loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 148{
 149        struct inode *inode = file->f_mapping->host;
 150
 151        return generic_file_llseek_size(file, offset, whence,
 152                                        inode->i_sb->s_maxbytes,
 153                                        i_size_read(inode));
 154}
 155EXPORT_SYMBOL(generic_file_llseek);
 156
 157/**
 158 * fixed_size_llseek - llseek implementation for fixed-sized devices
 159 * @file:       file structure to seek on
 160 * @offset:     file offset to seek to
 161 * @whence:     type of seek
 162 * @size:       size of the file
 163 *
 164 */
 165loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 166{
 167        switch (whence) {
 168        case SEEK_SET: case SEEK_CUR: case SEEK_END:
 169                return generic_file_llseek_size(file, offset, whence,
 170                                                size, size);
 171        default:
 172                return -EINVAL;
 173        }
 174}
 175EXPORT_SYMBOL(fixed_size_llseek);
 176
 177/**
 178 * noop_llseek - No Operation Performed llseek implementation
 179 * @file:       file structure to seek on
 180 * @offset:     file offset to seek to
 181 * @whence:     type of seek
 182 *
 183 * This is an implementation of ->llseek useable for the rare special case when
 184 * userspace expects the seek to succeed but the (device) file is actually not
 185 * able to perform the seek. In this case you use noop_llseek() instead of
 186 * falling back to the default implementation of ->llseek.
 187 */
 188loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 189{
 190        return file->f_pos;
 191}
 192EXPORT_SYMBOL(noop_llseek);
 193
 194loff_t no_llseek(struct file *file, loff_t offset, int whence)
 195{
 196        return -ESPIPE;
 197}
 198EXPORT_SYMBOL(no_llseek);
 199
 200loff_t default_llseek(struct file *file, loff_t offset, int whence)
 201{
 202        struct inode *inode = file_inode(file);
 203        loff_t retval;
 204
 205        mutex_lock(&inode->i_mutex);
 206        switch (whence) {
 207                case SEEK_END:
 208                        offset += i_size_read(inode);
 209                        break;
 210                case SEEK_CUR:
 211                        if (offset == 0) {
 212                                retval = file->f_pos;
 213                                goto out;
 214                        }
 215                        offset += file->f_pos;
 216                        break;
 217                case SEEK_DATA:
 218                        /*
 219                         * In the generic case the entire file is data, so as
 220                         * long as offset isn't at the end of the file then the
 221                         * offset is data.
 222                         */
 223                        if (offset >= inode->i_size) {
 224                                retval = -ENXIO;
 225                                goto out;
 226                        }
 227                        break;
 228                case SEEK_HOLE:
 229                        /*
 230                         * There is a virtual hole at the end of the file, so
 231                         * as long as offset isn't i_size or larger, return
 232                         * i_size.
 233                         */
 234                        if (offset >= inode->i_size) {
 235                                retval = -ENXIO;
 236                                goto out;
 237                        }
 238                        offset = inode->i_size;
 239                        break;
 240        }
 241        retval = -EINVAL;
 242        if (offset >= 0 || unsigned_offsets(file)) {
 243                if (offset != file->f_pos) {
 244                        file->f_pos = offset;
 245                        file->f_version = 0;
 246                }
 247                retval = offset;
 248        }
 249out:
 250        mutex_unlock(&inode->i_mutex);
 251        return retval;
 252}
 253EXPORT_SYMBOL(default_llseek);
 254
 255loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 256{
 257        loff_t (*fn)(struct file *, loff_t, int);
 258
 259        fn = no_llseek;
 260        if (file->f_mode & FMODE_LSEEK) {
 261                if (file->f_op->llseek)
 262                        fn = file->f_op->llseek;
 263        }
 264        return fn(file, offset, whence);
 265}
 266EXPORT_SYMBOL(vfs_llseek);
 267
 268static inline struct fd fdget_pos(int fd)
 269{
 270        return __to_fd(__fdget_pos(fd));
 271}
 272
 273static inline void fdput_pos(struct fd f)
 274{
 275        if (f.flags & FDPUT_POS_UNLOCK)
 276                mutex_unlock(&f.file->f_pos_lock);
 277        fdput(f);
 278}
 279
 280SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 281{
 282        off_t retval;
 283        struct fd f = fdget_pos(fd);
 284        if (!f.file)
 285                return -EBADF;
 286
 287        retval = -EINVAL;
 288        if (whence <= SEEK_MAX) {
 289                loff_t res = vfs_llseek(f.file, offset, whence);
 290                retval = res;
 291                if (res != (loff_t)retval)
 292                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 293        }
 294        fdput_pos(f);
 295        return retval;
 296}
 297
 #ifdef CONFIG_COMPAT
 /* Compat lseek entry point: forwards its arguments to sys_lseek(). */
 COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 {
         return sys_lseek(fd, offset, whence);
 }
 #endif /* CONFIG_COMPAT */
 304
 #ifdef __ARCH_WANT_SYS_LLSEEK
 /*
  * llseek: seek with the offset passed as two unsigned long halves.  On
  * success the new offset is copied to the user pointer @result.
  *
  * Returns 0 on success, -EBADF for an invalid descriptor, -EINVAL for a
  * @whence above SEEK_MAX, -EFAULT when @result cannot be written, or the
  * error reported by vfs_llseek().
  */
 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
                 unsigned long, offset_low, loff_t __user *, result,
                 unsigned int, whence)
 {
         struct fd f = fdget_pos(fd);
         loff_t offset;
         int retval;

         if (!f.file)
                 return -EBADF;

         if (whence > SEEK_MAX) {
                 retval = -EINVAL;
                 goto out_putf;
         }

         offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
                         whence);

         if (offset < 0)
                 retval = (int)offset;
         else if (copy_to_user(result, &offset, sizeof(offset)))
                 retval = -EFAULT;
         else
                 retval = 0;
 out_putf:
         fdput_pos(f);
         return retval;
 }
 #endif /* __ARCH_WANT_SYS_LLSEEK */
 335
 336/*
 337 * rw_verify_area doesn't like huge counts. We limit
 338 * them to something that fits in "int" so that others
 339 * won't have to do range checks all the time.
 340 */
 341int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 342{
 343        struct inode *inode;
 344        loff_t pos;
 345        int retval = -EINVAL;
 346
 347        inode = file_inode(file);
 348        if (unlikely((ssize_t) count < 0))
 349                return retval;
 350        pos = *ppos;
 351        if (unlikely(pos < 0)) {
 352                if (!unsigned_offsets(file))
 353                        return retval;
 354                if (count >= -pos) /* both values are in 0..LLONG_MAX */
 355                        return -EOVERFLOW;
 356        } else if (unlikely((loff_t) (pos + count) < 0)) {
 357                if (!unsigned_offsets(file))
 358                        return retval;
 359        }
 360
 361        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
 362                retval = locks_mandatory_area(
 363                        read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 364                        inode, file, pos, count);
 365                if (retval < 0)
 366                        return retval;
 367        }
 368        retval = security_file_permission(file,
 369                                read_write == READ ? MAY_READ : MAY_WRITE);
 370        if (retval)
 371                return retval;
 372        return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
 373}
 374
 375ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 376{
 377        struct iovec iov = { .iov_base = buf, .iov_len = len };
 378        struct kiocb kiocb;
 379        ssize_t ret;
 380
 381        init_sync_kiocb(&kiocb, filp);
 382        kiocb.ki_pos = *ppos;
 383        kiocb.ki_nbytes = len;
 384
 385        ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
 386        if (-EIOCBQUEUED == ret)
 387                ret = wait_on_sync_kiocb(&kiocb);
 388        *ppos = kiocb.ki_pos;
 389        return ret;
 390}
 391
 392EXPORT_SYMBOL(do_sync_read);
 393
 394ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 395{
 396        struct iovec iov = { .iov_base = buf, .iov_len = len };
 397        struct kiocb kiocb;
 398        struct iov_iter iter;
 399        ssize_t ret;
 400
 401        init_sync_kiocb(&kiocb, filp);
 402        kiocb.ki_pos = *ppos;
 403        kiocb.ki_nbytes = len;
 404        iov_iter_init(&iter, READ, &iov, 1, len);
 405
 406        ret = filp->f_op->read_iter(&kiocb, &iter);
 407        if (-EIOCBQUEUED == ret)
 408                ret = wait_on_sync_kiocb(&kiocb);
 409        *ppos = kiocb.ki_pos;
 410        return ret;
 411}
 412
 413EXPORT_SYMBOL(new_sync_read);
 414
 415ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
 416                   loff_t *pos)
 417{
 418        ssize_t ret;
 419
 420        if (file->f_op->read)
 421                ret = file->f_op->read(file, buf, count, pos);
 422        else if (file->f_op->aio_read)
 423                ret = do_sync_read(file, buf, count, pos);
 424        else if (file->f_op->read_iter)
 425                ret = new_sync_read(file, buf, count, pos);
 426        else
 427                ret = -EINVAL;
 428
 429        return ret;
 430}
 431
 432ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 433{
 434        ssize_t ret;
 435
 436        if (!(file->f_mode & FMODE_READ))
 437                return -EBADF;
 438        if (!(file->f_mode & FMODE_CAN_READ))
 439                return -EINVAL;
 440        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 441                return -EFAULT;
 442
 443        ret = rw_verify_area(READ, file, pos, count);
 444        if (ret >= 0) {
 445                count = ret;
 446                ret = __vfs_read(file, buf, count, pos);
 447                if (ret > 0) {
 448                        fsnotify_access(file);
 449                        add_rchar(current, ret);
 450                }
 451                inc_syscr(current);
 452        }
 453
 454        return ret;
 455}
 456
 457EXPORT_SYMBOL(vfs_read);
 458
 459ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 460{
 461        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 462        struct kiocb kiocb;
 463        ssize_t ret;
 464
 465        init_sync_kiocb(&kiocb, filp);
 466        kiocb.ki_pos = *ppos;
 467        kiocb.ki_nbytes = len;
 468
 469        ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
 470        if (-EIOCBQUEUED == ret)
 471                ret = wait_on_sync_kiocb(&kiocb);
 472        *ppos = kiocb.ki_pos;
 473        return ret;
 474}
 475
 476EXPORT_SYMBOL(do_sync_write);
 477
 478ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 479{
 480        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 481        struct kiocb kiocb;
 482        struct iov_iter iter;
 483        ssize_t ret;
 484
 485        init_sync_kiocb(&kiocb, filp);
 486        kiocb.ki_pos = *ppos;
 487        kiocb.ki_nbytes = len;
 488        iov_iter_init(&iter, WRITE, &iov, 1, len);
 489
 490        ret = filp->f_op->write_iter(&kiocb, &iter);
 491        if (-EIOCBQUEUED == ret)
 492                ret = wait_on_sync_kiocb(&kiocb);
 493        *ppos = kiocb.ki_pos;
 494        return ret;
 495}
 496
 497EXPORT_SYMBOL(new_sync_write);
 498
 499ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
 500{
 501        mm_segment_t old_fs;
 502        const char __user *p;
 503        ssize_t ret;
 504
 505        if (!(file->f_mode & FMODE_CAN_WRITE))
 506                return -EINVAL;
 507
 508        old_fs = get_fs();
 509        set_fs(get_ds());
 510        p = (__force const char __user *)buf;
 511        if (count > MAX_RW_COUNT)
 512                count =  MAX_RW_COUNT;
 513        if (file->f_op->write)
 514                ret = file->f_op->write(file, p, count, pos);
 515        else if (file->f_op->aio_write)
 516                ret = do_sync_write(file, p, count, pos);
 517        else
 518                ret = new_sync_write(file, p, count, pos);
 519        set_fs(old_fs);
 520        if (ret > 0) {
 521                fsnotify_modify(file);
 522                add_wchar(current, ret);
 523        }
 524        inc_syscw(current);
 525        return ret;
 526}
 527
 528EXPORT_SYMBOL(__kernel_write);
 529
 530ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 531{
 532        ssize_t ret;
 533
 534        if (!(file->f_mode & FMODE_WRITE))
 535                return -EBADF;
 536        if (!(file->f_mode & FMODE_CAN_WRITE))
 537                return -EINVAL;
 538        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 539                return -EFAULT;
 540
 541        ret = rw_verify_area(WRITE, file, pos, count);
 542        if (ret >= 0) {
 543                count = ret;
 544                file_start_write(file);
 545                if (file->f_op->write)
 546                        ret = file->f_op->write(file, buf, count, pos);
 547                else if (file->f_op->aio_write)
 548                        ret = do_sync_write(file, buf, count, pos);
 549                else
 550                        ret = new_sync_write(file, buf, count, pos);
 551                if (ret > 0) {
 552                        fsnotify_modify(file);
 553                        add_wchar(current, ret);
 554                }
 555                inc_syscw(current);
 556                file_end_write(file);
 557        }
 558
 559        return ret;
 560}
 561
 562EXPORT_SYMBOL(vfs_write);
 563
 564static inline loff_t file_pos_read(struct file *file)
 565{
 566        return file->f_pos;
 567}
 568
 569static inline void file_pos_write(struct file *file, loff_t pos)
 570{
 571        file->f_pos = pos;
 572}
 573
 574SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 575{
 576        struct fd f = fdget_pos(fd);
 577        ssize_t ret = -EBADF;
 578
 579        if (f.file) {
 580                loff_t pos = file_pos_read(f.file);
 581                ret = vfs_read(f.file, buf, count, &pos);
 582                if (ret >= 0)
 583                        file_pos_write(f.file, pos);
 584                fdput_pos(f);
 585        }
 586        return ret;
 587}
 588
 589SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 590                size_t, count)
 591{
 592        struct fd f = fdget_pos(fd);
 593        ssize_t ret = -EBADF;
 594
 595        if (f.file) {
 596                loff_t pos = file_pos_read(f.file);
 597                ret = vfs_write(f.file, buf, count, &pos);
 598                if (ret >= 0)
 599                        file_pos_write(f.file, pos);
 600                fdput_pos(f);
 601        }
 602
 603        return ret;
 604}
 605
 606SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 607                        size_t, count, loff_t, pos)
 608{
 609        struct fd f;
 610        ssize_t ret = -EBADF;
 611
 612        if (pos < 0)
 613                return -EINVAL;
 614
 615        f = fdget(fd);
 616        if (f.file) {
 617                ret = -ESPIPE;
 618                if (f.file->f_mode & FMODE_PREAD)
 619                        ret = vfs_read(f.file, buf, count, &pos);
 620                fdput(f);
 621        }
 622
 623        return ret;
 624}
 625
 626SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 627                         size_t, count, loff_t, pos)
 628{
 629        struct fd f;
 630        ssize_t ret = -EBADF;
 631
 632        if (pos < 0)
 633                return -EINVAL;
 634
 635        f = fdget(fd);
 636        if (f.file) {
 637                ret = -ESPIPE;
 638                if (f.file->f_mode & FMODE_PWRITE)  
 639                        ret = vfs_write(f.file, buf, count, &pos);
 640                fdput(f);
 641        }
 642
 643        return ret;
 644}
 645
 /*
  * Reduce an iovec's length in-place.  Return the resulting number of segments
  *
  * Segments are walked in order; the first one whose cumulative length
  * reaches @to is trimmed so the total is exactly @to and the walk stops.
  * If the total never reaches @to, nothing is modified and @nr_segs is
  * returned.
  */
 unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 {
         unsigned long used;
         size_t consumed = 0;

         for (used = 0; used < nr_segs; iov++) {
                 used++;
                 if (consumed + iov->iov_len >= to) {
                         iov->iov_len = to - consumed;
                         break;
                 }
                 consumed += iov->iov_len;
         }

         return used;
 }
 665EXPORT_SYMBOL(iov_shorten);
 666
 667static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov,
 668                unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)
 669{
 670        struct kiocb kiocb;
 671        struct iov_iter iter;
 672        ssize_t ret;
 673
 674        init_sync_kiocb(&kiocb, filp);
 675        kiocb.ki_pos = *ppos;
 676        kiocb.ki_nbytes = len;
 677
 678        iov_iter_init(&iter, rw, iov, nr_segs, len);
 679        ret = fn(&kiocb, &iter);
 680        if (ret == -EIOCBQUEUED)
 681                ret = wait_on_sync_kiocb(&kiocb);
 682        *ppos = kiocb.ki_pos;
 683        return ret;
 684}
 685
 686static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 687                unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
 688{
 689        struct kiocb kiocb;
 690        ssize_t ret;
 691
 692        init_sync_kiocb(&kiocb, filp);
 693        kiocb.ki_pos = *ppos;
 694        kiocb.ki_nbytes = len;
 695
 696        ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
 697        if (ret == -EIOCBQUEUED)
 698                ret = wait_on_sync_kiocb(&kiocb);
 699        *ppos = kiocb.ki_pos;
 700        return ret;
 701}
 702
 703/* Do it by hand, with file-ops */
 704static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
 705                unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
 706{
 707        struct iovec *vector = iov;
 708        ssize_t ret = 0;
 709
 710        while (nr_segs > 0) {
 711                void __user *base;
 712                size_t len;
 713                ssize_t nr;
 714
 715                base = vector->iov_base;
 716                len = vector->iov_len;
 717                vector++;
 718                nr_segs--;
 719
 720                nr = fn(filp, base, len, ppos);
 721
 722                if (nr < 0) {
 723                        if (!ret)
 724                                ret = nr;
 725                        break;
 726                }
 727                ret += nr;
 728                if (nr != len)
 729                        break;
 730        }
 731
 732        return ret;
 733}
 734
 735/* A write operation does a read from user space and vice versa */
 736#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 737
 738ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 739                              unsigned long nr_segs, unsigned long fast_segs,
 740                              struct iovec *fast_pointer,
 741                              struct iovec **ret_pointer)
 742{
 743        unsigned long seg;
 744        ssize_t ret;
 745        struct iovec *iov = fast_pointer;
 746
 747        /*
 748         * SuS says "The readv() function *may* fail if the iovcnt argument
 749         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 750         * traditionally returned zero for zero segments, so...
 751         */
 752        if (nr_segs == 0) {
 753                ret = 0;
 754                goto out;
 755        }
 756
 757        /*
 758         * First get the "struct iovec" from user memory and
 759         * verify all the pointers
 760         */
 761        if (nr_segs > UIO_MAXIOV) {
 762                ret = -EINVAL;
 763                goto out;
 764        }
 765        if (nr_segs > fast_segs) {
 766                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 767                if (iov == NULL) {
 768                        ret = -ENOMEM;
 769                        goto out;
 770                }
 771        }
 772        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 773                ret = -EFAULT;
 774                goto out;
 775        }
 776
 777        /*
 778         * According to the Single Unix Specification we should return EINVAL
 779         * if an element length is < 0 when cast to ssize_t or if the
 780         * total length would overflow the ssize_t return value of the
 781         * system call.
 782         *
 783         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 784         * overflow case.
 785         */
 786        ret = 0;
 787        for (seg = 0; seg < nr_segs; seg++) {
 788                void __user *buf = iov[seg].iov_base;
 789                ssize_t len = (ssize_t)iov[seg].iov_len;
 790
 791                /* see if we we're about to use an invalid len or if
 792                 * it's about to overflow ssize_t */
 793                if (len < 0) {
 794                        ret = -EINVAL;
 795                        goto out;
 796                }
 797                if (type >= 0
 798                    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 799                        ret = -EFAULT;
 800                        goto out;
 801                }
 802                if (len > MAX_RW_COUNT - ret) {
 803                        len = MAX_RW_COUNT - ret;
 804                        iov[seg].iov_len = len;
 805                }
 806                ret += len;
 807        }
 808out:
 809        *ret_pointer = iov;
 810        return ret;
 811}
 812
 813static ssize_t do_readv_writev(int type, struct file *file,
 814                               const struct iovec __user * uvector,
 815                               unsigned long nr_segs, loff_t *pos)
 816{
 817        size_t tot_len;
 818        struct iovec iovstack[UIO_FASTIOV];
 819        struct iovec *iov = iovstack;
 820        ssize_t ret;
 821        io_fn_t fn;
 822        iov_fn_t fnv;
 823        iter_fn_t iter_fn;
 824
 825        ret = rw_copy_check_uvector(type, uvector, nr_segs,
 826                                    ARRAY_SIZE(iovstack), iovstack, &iov);
 827        if (ret <= 0)
 828                goto out;
 829
 830        tot_len = ret;
 831        ret = rw_verify_area(type, file, pos, tot_len);
 832        if (ret < 0)
 833                goto out;
 834
 835        fnv = NULL;
 836        if (type == READ) {
 837                fn = file->f_op->read;
 838                fnv = file->f_op->aio_read;
 839                iter_fn = file->f_op->read_iter;
 840        } else {
 841                fn = (io_fn_t)file->f_op->write;
 842                fnv = file->f_op->aio_write;
 843                iter_fn = file->f_op->write_iter;
 844                file_start_write(file);
 845        }
 846
 847        if (iter_fn)
 848                ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
 849                                                pos, iter_fn);
 850        else if (fnv)
 851                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 852                                                pos, fnv);
 853        else
 854                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 855
 856        if (type != READ)
 857                file_end_write(file);
 858
 859out:
 860        if (iov != iovstack)
 861                kfree(iov);
 862        if ((ret + (type == READ)) > 0) {
 863                if (type == READ)
 864                        fsnotify_access(file);
 865                else
 866                        fsnotify_modify(file);
 867        }
 868        return ret;
 869}
 870
 871ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 872                  unsigned long vlen, loff_t *pos)
 873{
 874        if (!(file->f_mode & FMODE_READ))
 875                return -EBADF;
 876        if (!(file->f_mode & FMODE_CAN_READ))
 877                return -EINVAL;
 878
 879        return do_readv_writev(READ, file, vec, vlen, pos);
 880}
 881
 882EXPORT_SYMBOL(vfs_readv);
 883
 884ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 885                   unsigned long vlen, loff_t *pos)
 886{
 887        if (!(file->f_mode & FMODE_WRITE))
 888                return -EBADF;
 889        if (!(file->f_mode & FMODE_CAN_WRITE))
 890                return -EINVAL;
 891
 892        return do_readv_writev(WRITE, file, vec, vlen, pos);
 893}
 894
 895EXPORT_SYMBOL(vfs_writev);
 896
 897SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 898                unsigned long, vlen)
 899{
 900        struct fd f = fdget_pos(fd);
 901        ssize_t ret = -EBADF;
 902
 903        if (f.file) {
 904                loff_t pos = file_pos_read(f.file);
 905                ret = vfs_readv(f.file, vec, vlen, &pos);
 906                if (ret >= 0)
 907                        file_pos_write(f.file, pos);
 908                fdput_pos(f);
 909        }
 910
 911        if (ret > 0)
 912                add_rchar(current, ret);
 913        inc_syscr(current);
 914        return ret;
 915}
 916
 917SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 918                unsigned long, vlen)
 919{
 920        struct fd f = fdget_pos(fd);
 921        ssize_t ret = -EBADF;
 922
 923        if (f.file) {
 924                loff_t pos = file_pos_read(f.file);
 925                ret = vfs_writev(f.file, vec, vlen, &pos);
 926                if (ret >= 0)
 927                        file_pos_write(f.file, pos);
 928                fdput_pos(f);
 929        }
 930
 931        if (ret > 0)
 932                add_wchar(current, ret);
 933        inc_syscw(current);
 934        return ret;
 935}
 936
 937static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
 938{
 939#define HALF_LONG_BITS (BITS_PER_LONG / 2)
 940        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 941}
 942
 943SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 944                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 945{
 946        loff_t pos = pos_from_hilo(pos_h, pos_l);
 947        struct fd f;
 948        ssize_t ret = -EBADF;
 949
 950        if (pos < 0)
 951                return -EINVAL;
 952
 953        f = fdget(fd);
 954        if (f.file) {
 955                ret = -ESPIPE;
 956                if (f.file->f_mode & FMODE_PREAD)
 957                        ret = vfs_readv(f.file, vec, vlen, &pos);
 958                fdput(f);
 959        }
 960
 961        if (ret > 0)
 962                add_rchar(current, ret);
 963        inc_syscr(current);
 964        return ret;
 965}
 966
 967SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 968                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 969{
 970        loff_t pos = pos_from_hilo(pos_h, pos_l);
 971        struct fd f;
 972        ssize_t ret = -EBADF;
 973
 974        if (pos < 0)
 975                return -EINVAL;
 976
 977        f = fdget(fd);
 978        if (f.file) {
 979                ret = -ESPIPE;
 980                if (f.file->f_mode & FMODE_PWRITE)
 981                        ret = vfs_writev(f.file, vec, vlen, &pos);
 982                fdput(f);
 983        }
 984
 985        if (ret > 0)
 986                add_wchar(current, ret);
 987        inc_syscw(current);
 988        return ret;
 989}
 990
 991#ifdef CONFIG_COMPAT
 992
 993static ssize_t compat_do_readv_writev(int type, struct file *file,
 994                               const struct compat_iovec __user *uvector,
 995                               unsigned long nr_segs, loff_t *pos)
 996{
 997        compat_ssize_t tot_len;
 998        struct iovec iovstack[UIO_FASTIOV];
 999        struct iovec *iov = iovstack;
1000        ssize_t ret;
1001        io_fn_t fn;
1002        iov_fn_t fnv;
1003        iter_fn_t iter_fn;
1004
1005        ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1006                                               UIO_FASTIOV, iovstack, &iov);
1007        if (ret <= 0)
1008                goto out;
1009
1010        tot_len = ret;
1011        ret = rw_verify_area(type, file, pos, tot_len);
1012        if (ret < 0)
1013                goto out;
1014
1015        fnv = NULL;
1016        if (type == READ) {
1017                fn = file->f_op->read;
1018                fnv = file->f_op->aio_read;
1019                iter_fn = file->f_op->read_iter;
1020        } else {
1021                fn = (io_fn_t)file->f_op->write;
1022                fnv = file->f_op->aio_write;
1023                iter_fn = file->f_op->write_iter;
1024                file_start_write(file);
1025        }
1026
1027        if (iter_fn)
1028                ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
1029                                                pos, iter_fn);
1030        else if (fnv)
1031                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
1032                                                pos, fnv);
1033        else
1034                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
1035
1036        if (type != READ)
1037                file_end_write(file);
1038
1039out:
1040        if (iov != iovstack)
1041                kfree(iov);
1042        if ((ret + (type == READ)) > 0) {
1043                if (type == READ)
1044                        fsnotify_access(file);
1045                else
1046                        fsnotify_modify(file);
1047        }
1048        return ret;
1049}
1050
1051static size_t compat_readv(struct file *file,
1052                           const struct compat_iovec __user *vec,
1053                           unsigned long vlen, loff_t *pos)
1054{
1055        ssize_t ret = -EBADF;
1056
1057        if (!(file->f_mode & FMODE_READ))
1058                goto out;
1059
1060        ret = -EINVAL;
1061        if (!(file->f_mode & FMODE_CAN_READ))
1062                goto out;
1063
1064        ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
1065
1066out:
1067        if (ret > 0)
1068                add_rchar(current, ret);
1069        inc_syscr(current);
1070        return ret;
1071}
1072
1073COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1074                const struct compat_iovec __user *,vec,
1075                compat_ulong_t, vlen)
1076{
1077        struct fd f = fdget_pos(fd);
1078        ssize_t ret;
1079        loff_t pos;
1080
1081        if (!f.file)
1082                return -EBADF;
1083        pos = f.file->f_pos;
1084        ret = compat_readv(f.file, vec, vlen, &pos);
1085        if (ret >= 0)
1086                f.file->f_pos = pos;
1087        fdput_pos(f);
1088        return ret;
1089}
1090
1091static long __compat_sys_preadv64(unsigned long fd,
1092                                  const struct compat_iovec __user *vec,
1093                                  unsigned long vlen, loff_t pos)
1094{
1095        struct fd f;
1096        ssize_t ret;
1097
1098        if (pos < 0)
1099                return -EINVAL;
1100        f = fdget(fd);
1101        if (!f.file)
1102                return -EBADF;
1103        ret = -ESPIPE;
1104        if (f.file->f_mode & FMODE_PREAD)
1105                ret = compat_readv(f.file, vec, vlen, &pos);
1106        fdput(f);
1107        return ret;
1108}
1109
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/*
 * Compat preadv64: variant that receives the offset as a single loff_t
 * argument and forwards everything to __compat_sys_preadv64().  Only built
 * when the architecture defines __ARCH_WANT_COMPAT_SYS_PREADV64.
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	long ret = __compat_sys_preadv64(fd, vec, vlen, pos);

	return ret;
}
#endif
1118
1119COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1120                const struct compat_iovec __user *,vec,
1121                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1122{
1123        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1124
1125        return __compat_sys_preadv64(fd, vec, vlen, pos);
1126}
1127
1128static size_t compat_writev(struct file *file,
1129                            const struct compat_iovec __user *vec,
1130                            unsigned long vlen, loff_t *pos)
1131{
1132        ssize_t ret = -EBADF;
1133
1134        if (!(file->f_mode & FMODE_WRITE))
1135                goto out;
1136
1137        ret = -EINVAL;
1138        if (!(file->f_mode & FMODE_CAN_WRITE))
1139                goto out;
1140
1141        ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1142
1143out:
1144        if (ret > 0)
1145                add_wchar(current, ret);
1146        inc_syscw(current);
1147        return ret;
1148}
1149
1150COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1151                const struct compat_iovec __user *, vec,
1152                compat_ulong_t, vlen)
1153{
1154        struct fd f = fdget_pos(fd);
1155        ssize_t ret;
1156        loff_t pos;
1157
1158        if (!f.file)
1159                return -EBADF;
1160        pos = f.file->f_pos;
1161        ret = compat_writev(f.file, vec, vlen, &pos);
1162        if (ret >= 0)
1163                f.file->f_pos = pos;
1164        fdput_pos(f);
1165        return ret;
1166}
1167
1168static long __compat_sys_pwritev64(unsigned long fd,
1169                                   const struct compat_iovec __user *vec,
1170                                   unsigned long vlen, loff_t pos)
1171{
1172        struct fd f;
1173        ssize_t ret;
1174
1175        if (pos < 0)
1176                return -EINVAL;
1177        f = fdget(fd);
1178        if (!f.file)
1179                return -EBADF;
1180        ret = -ESPIPE;
1181        if (f.file->f_mode & FMODE_PWRITE)
1182                ret = compat_writev(f.file, vec, vlen, &pos);
1183        fdput(f);
1184        return ret;
1185}
1186
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/*
 * Compat pwritev64: variant that receives the offset as a single loff_t
 * argument and forwards everything to __compat_sys_pwritev64().  Only
 * built when the architecture defines __ARCH_WANT_COMPAT_SYS_PWRITEV64.
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	long ret = __compat_sys_pwritev64(fd, vec, vlen, pos);

	return ret;
}
#endif
1195
1196COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1197                const struct compat_iovec __user *,vec,
1198                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1199{
1200        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1201
1202        return __compat_sys_pwritev64(fd, vec, vlen, pos);
1203}
1204#endif
1205
1206static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1207                           size_t count, loff_t max)
1208{
1209        struct fd in, out;
1210        struct inode *in_inode, *out_inode;
1211        loff_t pos;
1212        loff_t out_pos;
1213        ssize_t retval;
1214        int fl;
1215
1216        /*
1217         * Get input file, and verify that it is ok..
1218         */
1219        retval = -EBADF;
1220        in = fdget(in_fd);
1221        if (!in.file)
1222                goto out;
1223        if (!(in.file->f_mode & FMODE_READ))
1224                goto fput_in;
1225        retval = -ESPIPE;
1226        if (!ppos) {
1227                pos = in.file->f_pos;
1228        } else {
1229                pos = *ppos;
1230                if (!(in.file->f_mode & FMODE_PREAD))
1231                        goto fput_in;
1232        }
1233        retval = rw_verify_area(READ, in.file, &pos, count);
1234        if (retval < 0)
1235                goto fput_in;
1236        count = retval;
1237
1238        /*
1239         * Get output file, and verify that it is ok..
1240         */
1241        retval = -EBADF;
1242        out = fdget(out_fd);
1243        if (!out.file)
1244                goto fput_in;
1245        if (!(out.file->f_mode & FMODE_WRITE))
1246                goto fput_out;
1247        retval = -EINVAL;
1248        in_inode = file_inode(in.file);
1249        out_inode = file_inode(out.file);
1250        out_pos = out.file->f_pos;
1251        retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1252        if (retval < 0)
1253                goto fput_out;
1254        count = retval;
1255
1256        if (!max)
1257                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1258
1259        if (unlikely(pos + count > max)) {
1260                retval = -EOVERFLOW;
1261                if (pos >= max)
1262                        goto fput_out;
1263                count = max - pos;
1264        }
1265
1266        fl = 0;
1267#if 0
1268        /*
1269         * We need to debate whether we can enable this or not. The
1270         * man page documents EAGAIN return for the output at least,
1271         * and the application is arguably buggy if it doesn't expect
1272         * EAGAIN on a non-blocking file descriptor.
1273         */
1274        if (in.file->f_flags & O_NONBLOCK)
1275                fl = SPLICE_F_NONBLOCK;
1276#endif
1277        file_start_write(out.file);
1278        retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1279        file_end_write(out.file);
1280
1281        if (retval > 0) {
1282                add_rchar(current, retval);
1283                add_wchar(current, retval);
1284                fsnotify_access(in.file);
1285                fsnotify_modify(out.file);
1286                out.file->f_pos = out_pos;
1287                if (ppos)
1288                        *ppos = pos;
1289                else
1290                        in.file->f_pos = pos;
1291        }
1292
1293        inc_syscr(current);
1294        inc_syscw(current);
1295        if (pos > max)
1296                retval = -EOVERFLOW;
1297
1298fput_out:
1299        fdput(out);
1300fput_in:
1301        fdput(in);
1302out:
1303        return retval;
1304}
1305
1306SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1307{
1308        loff_t pos;
1309        off_t off;
1310        ssize_t ret;
1311
1312        if (offset) {
1313                if (unlikely(get_user(off, offset)))
1314                        return -EFAULT;
1315                pos = off;
1316                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1317                if (unlikely(put_user(pos, offset)))
1318                        return -EFAULT;
1319                return ret;
1320        }
1321
1322        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1323}
1324
1325SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1326{
1327        loff_t pos;
1328        ssize_t ret;
1329
1330        if (offset) {
1331                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1332                        return -EFAULT;
1333                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1334                if (unlikely(put_user(pos, offset)))
1335                        return -EFAULT;
1336                return ret;
1337        }
1338
1339        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1340}
1341
1342#ifdef CONFIG_COMPAT
1343COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1344                compat_off_t __user *, offset, compat_size_t, count)
1345{
1346        loff_t pos;
1347        off_t off;
1348        ssize_t ret;
1349
1350        if (offset) {
1351                if (unlikely(get_user(off, offset)))
1352                        return -EFAULT;
1353                pos = off;
1354                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1355                if (unlikely(put_user(pos, offset)))
1356                        return -EFAULT;
1357                return ret;
1358        }
1359
1360        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1361}
1362
1363COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1364                compat_loff_t __user *, offset, compat_size_t, count)
1365{
1366        loff_t pos;
1367        ssize_t ret;
1368
1369        if (offset) {
1370                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1371                        return -EFAULT;
1372                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1373                if (unlikely(put_user(pos, offset)))
1374                        return -EFAULT;
1375                return ret;
1376        }
1377
1378        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1379}
1380#endif
1381
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.