linux/fs/read_write.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/read_write.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/slab.h> 
   8#include <linux/stat.h>
   9#include <linux/fcntl.h>
  10#include <linux/file.h>
  11#include <linux/uio.h>
  12#include <linux/fsnotify.h>
  13#include <linux/security.h>
  14#include <linux/export.h>
  15#include <linux/syscalls.h>
  16#include <linux/pagemap.h>
  17#include <linux/splice.h>
  18#include <linux/compat.h>
  19#include "read_write.h"
  20#include "internal.h"
  21
  22#include <asm/uaccess.h>
  23#include <asm/unistd.h>
  24
  25const struct file_operations generic_ro_fops = {
  26        .llseek         = generic_file_llseek,
  27        .read           = do_sync_read,
  28        .aio_read       = generic_file_aio_read,
  29        .mmap           = generic_file_readonly_mmap,
  30        .splice_read    = generic_file_splice_read,
  31};
  32
  33EXPORT_SYMBOL(generic_ro_fops);
  34
  35static inline int unsigned_offsets(struct file *file)
  36{
  37        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  38}
  39
  40static loff_t lseek_execute(struct file *file, struct inode *inode,
  41                loff_t offset, loff_t maxsize)
  42{
  43        if (offset < 0 && !unsigned_offsets(file))
  44                return -EINVAL;
  45        if (offset > maxsize)
  46                return -EINVAL;
  47
  48        if (offset != file->f_pos) {
  49                file->f_pos = offset;
  50                file->f_version = 0;
  51        }
  52        return offset;
  53}
  54
  55/**
  56 * generic_file_llseek_size - generic llseek implementation for regular files
  57 * @file:       file structure to seek on
  58 * @offset:     file offset to seek to
  59 * @whence:     type of seek
  60 * @size:       max size of this file in file system
  61 * @eof:        offset used for SEEK_END position
  62 *
  63 * This is a variant of generic_file_llseek that allows passing in a custom
  64 * maximum file size and a custom EOF position, for e.g. hashed directories
  65 *
  66 * Synchronization:
  67 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  68 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  69 * read/writes behave like SEEK_SET against seeks.
  70 */
  71loff_t
  72generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  73                loff_t maxsize, loff_t eof)
  74{
  75        struct inode *inode = file->f_mapping->host;
  76
  77        switch (whence) {
  78        case SEEK_END:
  79                offset += eof;
  80                break;
  81        case SEEK_CUR:
  82                /*
  83                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  84                 * position-querying operation.  Avoid rewriting the "same"
  85                 * f_pos value back to the file because a concurrent read(),
  86                 * write() or lseek() might have altered it
  87                 */
  88                if (offset == 0)
  89                        return file->f_pos;
  90                /*
  91                 * f_lock protects against read/modify/write race with other
  92                 * SEEK_CURs. Note that parallel writes and reads behave
  93                 * like SEEK_SET.
  94                 */
  95                spin_lock(&file->f_lock);
  96                offset = lseek_execute(file, inode, file->f_pos + offset,
  97                                       maxsize);
  98                spin_unlock(&file->f_lock);
  99                return offset;
 100        case SEEK_DATA:
 101                /*
 102                 * In the generic case the entire file is data, so as long as
 103                 * offset isn't at the end of the file then the offset is data.
 104                 */
 105                if (offset >= eof)
 106                        return -ENXIO;
 107                break;
 108        case SEEK_HOLE:
 109                /*
 110                 * There is a virtual hole at the end of the file, so as long as
 111                 * offset isn't i_size or larger, return i_size.
 112                 */
 113                if (offset >= eof)
 114                        return -ENXIO;
 115                offset = eof;
 116                break;
 117        }
 118
 119        return lseek_execute(file, inode, offset, maxsize);
 120}
 121EXPORT_SYMBOL(generic_file_llseek_size);
 122
 123/**
 124 * generic_file_llseek - generic llseek implementation for regular files
 125 * @file:       file structure to seek on
 126 * @offset:     file offset to seek to
 127 * @whence:     type of seek
 128 *
 129 * This is a generic implemenation of ->llseek useable for all normal local
 130 * filesystems.  It just updates the file offset to the value specified by
 131 * @offset and @whence under i_mutex.
 132 */
 133loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 134{
 135        struct inode *inode = file->f_mapping->host;
 136
 137        return generic_file_llseek_size(file, offset, whence,
 138                                        inode->i_sb->s_maxbytes,
 139                                        i_size_read(inode));
 140}
 141EXPORT_SYMBOL(generic_file_llseek);
 142
 143/**
 144 * noop_llseek - No Operation Performed llseek implementation
 145 * @file:       file structure to seek on
 146 * @offset:     file offset to seek to
 147 * @whence:     type of seek
 148 *
 149 * This is an implementation of ->llseek useable for the rare special case when
 150 * userspace expects the seek to succeed but the (device) file is actually not
 151 * able to perform the seek. In this case you use noop_llseek() instead of
 152 * falling back to the default implementation of ->llseek.
 153 */
 154loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 155{
 156        return file->f_pos;
 157}
 158EXPORT_SYMBOL(noop_llseek);
 159
 160loff_t no_llseek(struct file *file, loff_t offset, int whence)
 161{
 162        return -ESPIPE;
 163}
 164EXPORT_SYMBOL(no_llseek);
 165
 166loff_t default_llseek(struct file *file, loff_t offset, int whence)
 167{
 168        struct inode *inode = file_inode(file);
 169        loff_t retval;
 170
 171        mutex_lock(&inode->i_mutex);
 172        switch (whence) {
 173                case SEEK_END:
 174                        offset += i_size_read(inode);
 175                        break;
 176                case SEEK_CUR:
 177                        if (offset == 0) {
 178                                retval = file->f_pos;
 179                                goto out;
 180                        }
 181                        offset += file->f_pos;
 182                        break;
 183                case SEEK_DATA:
 184                        /*
 185                         * In the generic case the entire file is data, so as
 186                         * long as offset isn't at the end of the file then the
 187                         * offset is data.
 188                         */
 189                        if (offset >= inode->i_size) {
 190                                retval = -ENXIO;
 191                                goto out;
 192                        }
 193                        break;
 194                case SEEK_HOLE:
 195                        /*
 196                         * There is a virtual hole at the end of the file, so
 197                         * as long as offset isn't i_size or larger, return
 198                         * i_size.
 199                         */
 200                        if (offset >= inode->i_size) {
 201                                retval = -ENXIO;
 202                                goto out;
 203                        }
 204                        offset = inode->i_size;
 205                        break;
 206        }
 207        retval = -EINVAL;
 208        if (offset >= 0 || unsigned_offsets(file)) {
 209                if (offset != file->f_pos) {
 210                        file->f_pos = offset;
 211                        file->f_version = 0;
 212                }
 213                retval = offset;
 214        }
 215out:
 216        mutex_unlock(&inode->i_mutex);
 217        return retval;
 218}
 219EXPORT_SYMBOL(default_llseek);
 220
 221loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 222{
 223        loff_t (*fn)(struct file *, loff_t, int);
 224
 225        fn = no_llseek;
 226        if (file->f_mode & FMODE_LSEEK) {
 227                if (file->f_op && file->f_op->llseek)
 228                        fn = file->f_op->llseek;
 229        }
 230        return fn(file, offset, whence);
 231}
 232EXPORT_SYMBOL(vfs_llseek);
 233
 234SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 235{
 236        off_t retval;
 237        struct fd f = fdget(fd);
 238        if (!f.file)
 239                return -EBADF;
 240
 241        retval = -EINVAL;
 242        if (whence <= SEEK_MAX) {
 243                loff_t res = vfs_llseek(f.file, offset, whence);
 244                retval = res;
 245                if (res != (loff_t)retval)
 246                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 247        }
 248        fdput(f);
 249        return retval;
 250}
 251
 252#ifdef CONFIG_COMPAT
 253COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 254{
 255        return sys_lseek(fd, offset, whence);
 256}
 257#endif
 258
 259#ifdef __ARCH_WANT_SYS_LLSEEK
 260SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 261                unsigned long, offset_low, loff_t __user *, result,
 262                unsigned int, whence)
 263{
 264        int retval;
 265        struct fd f = fdget(fd);
 266        loff_t offset;
 267
 268        if (!f.file)
 269                return -EBADF;
 270
 271        retval = -EINVAL;
 272        if (whence > SEEK_MAX)
 273                goto out_putf;
 274
 275        offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 276                        whence);
 277
 278        retval = (int)offset;
 279        if (offset >= 0) {
 280                retval = -EFAULT;
 281                if (!copy_to_user(result, &offset, sizeof(offset)))
 282                        retval = 0;
 283        }
 284out_putf:
 285        fdput(f);
 286        return retval;
 287}
 288#endif
 289
 290/*
 291 * rw_verify_area doesn't like huge counts. We limit
 292 * them to something that fits in "int" so that others
 293 * won't have to do range checks all the time.
 294 */
 295int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
 296{
 297        struct inode *inode;
 298        loff_t pos;
 299        int retval = -EINVAL;
 300
 301        inode = file_inode(file);
 302        if (unlikely((ssize_t) count < 0))
 303                return retval;
 304        pos = *ppos;
 305        if (unlikely(pos < 0)) {
 306                if (!unsigned_offsets(file))
 307                        return retval;
 308                if (count >= -pos) /* both values are in 0..LLONG_MAX */
 309                        return -EOVERFLOW;
 310        } else if (unlikely((loff_t) (pos + count) < 0)) {
 311                if (!unsigned_offsets(file))
 312                        return retval;
 313        }
 314
 315        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
 316                retval = locks_mandatory_area(
 317                        read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 318                        inode, file, pos, count);
 319                if (retval < 0)
 320                        return retval;
 321        }
 322        retval = security_file_permission(file,
 323                                read_write == READ ? MAY_READ : MAY_WRITE);
 324        if (retval)
 325                return retval;
 326        return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
 327}
 328
 329static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
 330{
 331        set_current_state(TASK_UNINTERRUPTIBLE);
 332        if (!kiocbIsKicked(iocb))
 333                schedule();
 334        else
 335                kiocbClearKicked(iocb);
 336        __set_current_state(TASK_RUNNING);
 337}
 338
 339ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 340{
 341        struct iovec iov = { .iov_base = buf, .iov_len = len };
 342        struct kiocb kiocb;
 343        ssize_t ret;
 344
 345        init_sync_kiocb(&kiocb, filp);
 346        kiocb.ki_pos = *ppos;
 347        kiocb.ki_left = len;
 348        kiocb.ki_nbytes = len;
 349
 350        for (;;) {
 351                ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
 352                if (ret != -EIOCBRETRY)
 353                        break;
 354                wait_on_retry_sync_kiocb(&kiocb);
 355        }
 356
 357        if (-EIOCBQUEUED == ret)
 358                ret = wait_on_sync_kiocb(&kiocb);
 359        *ppos = kiocb.ki_pos;
 360        return ret;
 361}
 362
 363EXPORT_SYMBOL(do_sync_read);
 364
 365ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 366{
 367        ssize_t ret;
 368
 369        if (!(file->f_mode & FMODE_READ))
 370                return -EBADF;
 371        if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
 372                return -EINVAL;
 373        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 374                return -EFAULT;
 375
 376        ret = rw_verify_area(READ, file, pos, count);
 377        if (ret >= 0) {
 378                count = ret;
 379                if (file->f_op->read)
 380                        ret = file->f_op->read(file, buf, count, pos);
 381                else
 382                        ret = do_sync_read(file, buf, count, pos);
 383                if (ret > 0) {
 384                        fsnotify_access(file);
 385                        add_rchar(current, ret);
 386                }
 387                inc_syscr(current);
 388        }
 389
 390        return ret;
 391}
 392
 393EXPORT_SYMBOL(vfs_read);
 394
 395ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 396{
 397        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 398        struct kiocb kiocb;
 399        ssize_t ret;
 400
 401        init_sync_kiocb(&kiocb, filp);
 402        kiocb.ki_pos = *ppos;
 403        kiocb.ki_left = len;
 404        kiocb.ki_nbytes = len;
 405
 406        for (;;) {
 407                ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
 408                if (ret != -EIOCBRETRY)
 409                        break;
 410                wait_on_retry_sync_kiocb(&kiocb);
 411        }
 412
 413        if (-EIOCBQUEUED == ret)
 414                ret = wait_on_sync_kiocb(&kiocb);
 415        *ppos = kiocb.ki_pos;
 416        return ret;
 417}
 418
 419EXPORT_SYMBOL(do_sync_write);
 420
 421ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
 422{
 423        mm_segment_t old_fs;
 424        const char __user *p;
 425        ssize_t ret;
 426
 427        if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
 428                return -EINVAL;
 429
 430        old_fs = get_fs();
 431        set_fs(get_ds());
 432        p = (__force const char __user *)buf;
 433        if (count > MAX_RW_COUNT)
 434                count =  MAX_RW_COUNT;
 435        if (file->f_op->write)
 436                ret = file->f_op->write(file, p, count, pos);
 437        else
 438                ret = do_sync_write(file, p, count, pos);
 439        set_fs(old_fs);
 440        if (ret > 0) {
 441                fsnotify_modify(file);
 442                add_wchar(current, ret);
 443        }
 444        inc_syscw(current);
 445        return ret;
 446}
 447
 448ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 449{
 450        ssize_t ret;
 451
 452        if (!(file->f_mode & FMODE_WRITE))
 453                return -EBADF;
 454        if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
 455                return -EINVAL;
 456        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 457                return -EFAULT;
 458
 459        ret = rw_verify_area(WRITE, file, pos, count);
 460        if (ret >= 0) {
 461                count = ret;
 462                if (file->f_op->write)
 463                        ret = file->f_op->write(file, buf, count, pos);
 464                else
 465                        ret = do_sync_write(file, buf, count, pos);
 466                if (ret > 0) {
 467                        fsnotify_modify(file);
 468                        add_wchar(current, ret);
 469                }
 470                inc_syscw(current);
 471        }
 472
 473        return ret;
 474}
 475
 476EXPORT_SYMBOL(vfs_write);
 477
 478static inline loff_t file_pos_read(struct file *file)
 479{
 480        return file->f_pos;
 481}
 482
 483static inline void file_pos_write(struct file *file, loff_t pos)
 484{
 485        file->f_pos = pos;
 486}
 487
 488SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 489{
 490        struct fd f = fdget(fd);
 491        ssize_t ret = -EBADF;
 492
 493        if (f.file) {
 494                loff_t pos = file_pos_read(f.file);
 495                ret = vfs_read(f.file, buf, count, &pos);
 496                file_pos_write(f.file, pos);
 497                fdput(f);
 498        }
 499        return ret;
 500}
 501
 502SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 503                size_t, count)
 504{
 505        struct fd f = fdget(fd);
 506        ssize_t ret = -EBADF;
 507
 508        if (f.file) {
 509                loff_t pos = file_pos_read(f.file);
 510                ret = vfs_write(f.file, buf, count, &pos);
 511                file_pos_write(f.file, pos);
 512                fdput(f);
 513        }
 514
 515        return ret;
 516}
 517
 518SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
 519                        size_t count, loff_t pos)
 520{
 521        struct fd f;
 522        ssize_t ret = -EBADF;
 523
 524        if (pos < 0)
 525                return -EINVAL;
 526
 527        f = fdget(fd);
 528        if (f.file) {
 529                ret = -ESPIPE;
 530                if (f.file->f_mode & FMODE_PREAD)
 531                        ret = vfs_read(f.file, buf, count, &pos);
 532                fdput(f);
 533        }
 534
 535        return ret;
 536}
 537#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 538asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
 539{
 540        return SYSC_pread64((unsigned int) fd, (char __user *) buf,
 541                            (size_t) count, pos);
 542}
 543SYSCALL_ALIAS(sys_pread64, SyS_pread64);
 544#endif
 545
 546SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
 547                         size_t count, loff_t pos)
 548{
 549        struct fd f;
 550        ssize_t ret = -EBADF;
 551
 552        if (pos < 0)
 553                return -EINVAL;
 554
 555        f = fdget(fd);
 556        if (f.file) {
 557                ret = -ESPIPE;
 558                if (f.file->f_mode & FMODE_PWRITE)  
 559                        ret = vfs_write(f.file, buf, count, &pos);
 560                fdput(f);
 561        }
 562
 563        return ret;
 564}
 565#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 566asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
 567{
 568        return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
 569                             (size_t) count, pos);
 570}
 571SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
 572#endif
 573
 574/*
 575 * Reduce an iovec's length in-place.  Return the resulting number of segments
 576 */
 577unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 578{
 579        unsigned long seg = 0;
 580        size_t len = 0;
 581
 582        while (seg < nr_segs) {
 583                seg++;
 584                if (len + iov->iov_len >= to) {
 585                        iov->iov_len = to - len;
 586                        break;
 587                }
 588                len += iov->iov_len;
 589                iov++;
 590        }
 591        return seg;
 592}
 593EXPORT_SYMBOL(iov_shorten);
 594
 595ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 596                unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
 597{
 598        struct kiocb kiocb;
 599        ssize_t ret;
 600
 601        init_sync_kiocb(&kiocb, filp);
 602        kiocb.ki_pos = *ppos;
 603        kiocb.ki_left = len;
 604        kiocb.ki_nbytes = len;
 605
 606        for (;;) {
 607                ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
 608                if (ret != -EIOCBRETRY)
 609                        break;
 610                wait_on_retry_sync_kiocb(&kiocb);
 611        }
 612
 613        if (ret == -EIOCBQUEUED)
 614                ret = wait_on_sync_kiocb(&kiocb);
 615        *ppos = kiocb.ki_pos;
 616        return ret;
 617}
 618
 619/* Do it by hand, with file-ops */
 620ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
 621                unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
 622{
 623        struct iovec *vector = iov;
 624        ssize_t ret = 0;
 625
 626        while (nr_segs > 0) {
 627                void __user *base;
 628                size_t len;
 629                ssize_t nr;
 630
 631                base = vector->iov_base;
 632                len = vector->iov_len;
 633                vector++;
 634                nr_segs--;
 635
 636                nr = fn(filp, base, len, ppos);
 637
 638                if (nr < 0) {
 639                        if (!ret)
 640                                ret = nr;
 641                        break;
 642                }
 643                ret += nr;
 644                if (nr != len)
 645                        break;
 646        }
 647
 648        return ret;
 649}
 650
 651/* A write operation does a read from user space and vice versa */
 652#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 653
 654ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 655                              unsigned long nr_segs, unsigned long fast_segs,
 656                              struct iovec *fast_pointer,
 657                              struct iovec **ret_pointer)
 658{
 659        unsigned long seg;
 660        ssize_t ret;
 661        struct iovec *iov = fast_pointer;
 662
 663        /*
 664         * SuS says "The readv() function *may* fail if the iovcnt argument
 665         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 666         * traditionally returned zero for zero segments, so...
 667         */
 668        if (nr_segs == 0) {
 669                ret = 0;
 670                goto out;
 671        }
 672
 673        /*
 674         * First get the "struct iovec" from user memory and
 675         * verify all the pointers
 676         */
 677        if (nr_segs > UIO_MAXIOV) {
 678                ret = -EINVAL;
 679                goto out;
 680        }
 681        if (nr_segs > fast_segs) {
 682                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 683                if (iov == NULL) {
 684                        ret = -ENOMEM;
 685                        goto out;
 686                }
 687        }
 688        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 689                ret = -EFAULT;
 690                goto out;
 691        }
 692
 693        /*
 694         * According to the Single Unix Specification we should return EINVAL
 695         * if an element length is < 0 when cast to ssize_t or if the
 696         * total length would overflow the ssize_t return value of the
 697         * system call.
 698         *
 699         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 700         * overflow case.
 701         */
 702        ret = 0;
 703        for (seg = 0; seg < nr_segs; seg++) {
 704                void __user *buf = iov[seg].iov_base;
 705                ssize_t len = (ssize_t)iov[seg].iov_len;
 706
 707                /* see if we we're about to use an invalid len or if
 708                 * it's about to overflow ssize_t */
 709                if (len < 0) {
 710                        ret = -EINVAL;
 711                        goto out;
 712                }
 713                if (type >= 0
 714                    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 715                        ret = -EFAULT;
 716                        goto out;
 717                }
 718                if (len > MAX_RW_COUNT - ret) {
 719                        len = MAX_RW_COUNT - ret;
 720                        iov[seg].iov_len = len;
 721                }
 722                ret += len;
 723        }
 724out:
 725        *ret_pointer = iov;
 726        return ret;
 727}
 728
 729static ssize_t do_readv_writev(int type, struct file *file,
 730                               const struct iovec __user * uvector,
 731                               unsigned long nr_segs, loff_t *pos)
 732{
 733        size_t tot_len;
 734        struct iovec iovstack[UIO_FASTIOV];
 735        struct iovec *iov = iovstack;
 736        ssize_t ret;
 737        io_fn_t fn;
 738        iov_fn_t fnv;
 739
 740        if (!file->f_op) {
 741                ret = -EINVAL;
 742                goto out;
 743        }
 744
 745        ret = rw_copy_check_uvector(type, uvector, nr_segs,
 746                                    ARRAY_SIZE(iovstack), iovstack, &iov);
 747        if (ret <= 0)
 748                goto out;
 749
 750        tot_len = ret;
 751        ret = rw_verify_area(type, file, pos, tot_len);
 752        if (ret < 0)
 753                goto out;
 754
 755        fnv = NULL;
 756        if (type == READ) {
 757                fn = file->f_op->read;
 758                fnv = file->f_op->aio_read;
 759        } else {
 760                fn = (io_fn_t)file->f_op->write;
 761                fnv = file->f_op->aio_write;
 762        }
 763
 764        if (fnv)
 765                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 766                                                pos, fnv);
 767        else
 768                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 769
 770out:
 771        if (iov != iovstack)
 772                kfree(iov);
 773        if ((ret + (type == READ)) > 0) {
 774                if (type == READ)
 775                        fsnotify_access(file);
 776                else
 777                        fsnotify_modify(file);
 778        }
 779        return ret;
 780}
 781
 782ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 783                  unsigned long vlen, loff_t *pos)
 784{
 785        if (!(file->f_mode & FMODE_READ))
 786                return -EBADF;
 787        if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 788                return -EINVAL;
 789
 790        return do_readv_writev(READ, file, vec, vlen, pos);
 791}
 792
 793EXPORT_SYMBOL(vfs_readv);
 794
 795ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 796                   unsigned long vlen, loff_t *pos)
 797{
 798        if (!(file->f_mode & FMODE_WRITE))
 799                return -EBADF;
 800        if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
 801                return -EINVAL;
 802
 803        return do_readv_writev(WRITE, file, vec, vlen, pos);
 804}
 805
 806EXPORT_SYMBOL(vfs_writev);
 807
 808SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 809                unsigned long, vlen)
 810{
 811        struct fd f = fdget(fd);
 812        ssize_t ret = -EBADF;
 813
 814        if (f.file) {
 815                loff_t pos = file_pos_read(f.file);
 816                ret = vfs_readv(f.file, vec, vlen, &pos);
 817                file_pos_write(f.file, pos);
 818                fdput(f);
 819        }
 820
 821        if (ret > 0)
 822                add_rchar(current, ret);
 823        inc_syscr(current);
 824        return ret;
 825}
 826
 827SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 828                unsigned long, vlen)
 829{
 830        struct fd f = fdget(fd);
 831        ssize_t ret = -EBADF;
 832
 833        if (f.file) {
 834                loff_t pos = file_pos_read(f.file);
 835                ret = vfs_writev(f.file, vec, vlen, &pos);
 836                file_pos_write(f.file, pos);
 837                fdput(f);
 838        }
 839
 840        if (ret > 0)
 841                add_wchar(current, ret);
 842        inc_syscw(current);
 843        return ret;
 844}
 845
 846static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
 847{
 848#define HALF_LONG_BITS (BITS_PER_LONG / 2)
 849        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 850}
 851
 852SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 853                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 854{
 855        loff_t pos = pos_from_hilo(pos_h, pos_l);
 856        struct fd f;
 857        ssize_t ret = -EBADF;
 858
 859        if (pos < 0)
 860                return -EINVAL;
 861
 862        f = fdget(fd);
 863        if (f.file) {
 864                ret = -ESPIPE;
 865                if (f.file->f_mode & FMODE_PREAD)
 866                        ret = vfs_readv(f.file, vec, vlen, &pos);
 867                fdput(f);
 868        }
 869
 870        if (ret > 0)
 871                add_rchar(current, ret);
 872        inc_syscr(current);
 873        return ret;
 874}
 875
 876SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 877                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 878{
 879        loff_t pos = pos_from_hilo(pos_h, pos_l);
 880        struct fd f;
 881        ssize_t ret = -EBADF;
 882
 883        if (pos < 0)
 884                return -EINVAL;
 885
 886        f = fdget(fd);
 887        if (f.file) {
 888                ret = -ESPIPE;
 889                if (f.file->f_mode & FMODE_PWRITE)
 890                        ret = vfs_writev(f.file, vec, vlen, &pos);
 891                fdput(f);
 892        }
 893
 894        if (ret > 0)
 895                add_wchar(current, ret);
 896        inc_syscw(current);
 897        return ret;
 898}
 899
 900ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
 901                    loff_t max)
 902{
 903        struct fd in, out;
 904        struct inode *in_inode, *out_inode;
 905        loff_t pos;
 906        ssize_t retval;
 907        int fl;
 908
 909        /*
 910         * Get input file, and verify that it is ok..
 911         */
 912        retval = -EBADF;
 913        in = fdget(in_fd);
 914        if (!in.file)
 915                goto out;
 916        if (!(in.file->f_mode & FMODE_READ))
 917                goto fput_in;
 918        retval = -ESPIPE;
 919        if (!ppos)
 920                ppos = &in.file->f_pos;
 921        else
 922                if (!(in.file->f_mode & FMODE_PREAD))
 923                        goto fput_in;
 924        retval = rw_verify_area(READ, in.file, ppos, count);
 925        if (retval < 0)
 926                goto fput_in;
 927        count = retval;
 928
 929        /*
 930         * Get output file, and verify that it is ok..
 931         */
 932        retval = -EBADF;
 933        out = fdget(out_fd);
 934        if (!out.file)
 935                goto fput_in;
 936        if (!(out.file->f_mode & FMODE_WRITE))
 937                goto fput_out;
 938        retval = -EINVAL;
 939        in_inode = file_inode(in.file);
 940        out_inode = file_inode(out.file);
 941        retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
 942        if (retval < 0)
 943                goto fput_out;
 944        count = retval;
 945
 946        if (!max)
 947                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
 948
 949        pos = *ppos;
 950        if (unlikely(pos + count > max)) {
 951                retval = -EOVERFLOW;
 952                if (pos >= max)
 953                        goto fput_out;
 954                count = max - pos;
 955        }
 956
 957        fl = 0;
 958#if 0
 959        /*
 960         * We need to debate whether we can enable this or not. The
 961         * man page documents EAGAIN return for the output at least,
 962         * and the application is arguably buggy if it doesn't expect
 963         * EAGAIN on a non-blocking file descriptor.
 964         */
 965        if (in.file->f_flags & O_NONBLOCK)
 966                fl = SPLICE_F_NONBLOCK;
 967#endif
 968        retval = do_splice_direct(in.file, ppos, out.file, count, fl);
 969
 970        if (retval > 0) {
 971                add_rchar(current, retval);
 972                add_wchar(current, retval);
 973                fsnotify_access(in.file);
 974                fsnotify_modify(out.file);
 975        }
 976
 977        inc_syscr(current);
 978        inc_syscw(current);
 979        if (*ppos > max)
 980                retval = -EOVERFLOW;
 981
 982fput_out:
 983        fdput(out);
 984fput_in:
 985        fdput(in);
 986out:
 987        return retval;
 988}
 989
 990SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
 991{
 992        loff_t pos;
 993        off_t off;
 994        ssize_t ret;
 995
 996        if (offset) {
 997                if (unlikely(get_user(off, offset)))
 998                        return -EFAULT;
 999                pos = off;
1000                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1001                if (unlikely(put_user(pos, offset)))
1002                        return -EFAULT;
1003                return ret;
1004        }
1005
1006        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1007}
1008
1009SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1010{
1011        loff_t pos;
1012        ssize_t ret;
1013
1014        if (offset) {
1015                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1016                        return -EFAULT;
1017                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1018                if (unlikely(put_user(pos, offset)))
1019                        return -EFAULT;
1020                return ret;
1021        }
1022
1023        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1024}
1025
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.