linux/fs/splice.c
<<
>>
Prefs
   1/*
   2 * "splice": joining two ropes together by interweaving their strands.
   3 *
   4 * This is the "extended pipe" functionality, where a pipe is used as
   5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
   6 * buffer that you can use to transfer data from one end to the other.
   7 *
   8 * The traditional unix read/write is extended with a "splice()" operation
   9 * that transfers data buffers to or from a pipe buffer.
  10 *
  11 * Named by Larry McVoy, original implementation from Linus, extended by
  12 * Jens to support splicing to files, network, direct splicing, etc and
  13 * fixing lots of bugs.
  14 *
  15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
  16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
  17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
  18 *
  19 */
  20#include <linux/fs.h>
  21#include <linux/file.h>
  22#include <linux/pagemap.h>
  23#include <linux/splice.h>
  24#include <linux/memcontrol.h>
  25#include <linux/mm_inline.h>
  26#include <linux/swap.h>
  27#include <linux/writeback.h>
  28#include <linux/buffer_head.h>
  29#include <linux/module.h>
  30#include <linux/syscalls.h>
  31#include <linux/uio.h>
  32#include <linux/security.h>
  33
  34/*
  35 * Attempt to steal a page from a pipe buffer. This should perhaps go into
  36 * a vm helper function, it's already simplified quite a bit by the
  37 * addition of remove_mapping(). If success is returned, the caller may
  38 * attempt to reuse this page for another destination.
  39 */
  40static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
  41                                     struct pipe_buffer *buf)
  42{
  43        struct page *page = buf->page;
  44        struct address_space *mapping;
  45
  46        lock_page(page);
  47
  48        mapping = page_mapping(page);
  49        if (mapping) {
  50                WARN_ON(!PageUptodate(page));
  51
  52                /*
  53                 * At least for ext2 with nobh option, we need to wait on
  54                 * writeback completing on this page, since we'll remove it
  55                 * from the pagecache.  Otherwise truncate wont wait on the
  56                 * page, allowing the disk blocks to be reused by someone else
  57                 * before we actually wrote our data to them. fs corruption
  58                 * ensues.
  59                 */
  60                wait_on_page_writeback(page);
  61
  62                if (page_has_private(page) &&
  63                    !try_to_release_page(page, GFP_KERNEL))
  64                        goto out_unlock;
  65
  66                /*
  67                 * If we succeeded in removing the mapping, set LRU flag
  68                 * and return good.
  69                 */
  70                if (remove_mapping(mapping, page)) {
  71                        buf->flags |= PIPE_BUF_FLAG_LRU;
  72                        return 0;
  73                }
  74        }
  75
  76        /*
  77         * Raced with truncate or failed to remove page from current
  78         * address space, unlock and return failure.
  79         */
  80out_unlock:
  81        unlock_page(page);
  82        return 1;
  83}
  84
  85static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
  86                                        struct pipe_buffer *buf)
  87{
  88        page_cache_release(buf->page);
  89        buf->flags &= ~PIPE_BUF_FLAG_LRU;
  90}
  91
  92/*
  93 * Check whether the contents of buf is OK to access. Since the content
  94 * is a page cache page, IO may be in flight.
  95 */
  96static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
  97                                       struct pipe_buffer *buf)
  98{
  99        struct page *page = buf->page;
 100        int err;
 101
 102        if (!PageUptodate(page)) {
 103                lock_page(page);
 104
 105                /*
 106                 * Page got truncated/unhashed. This will cause a 0-byte
 107                 * splice, if this is the first page.
 108                 */
 109                if (!page->mapping) {
 110                        err = -ENODATA;
 111                        goto error;
 112                }
 113
 114                /*
 115                 * Uh oh, read-error from disk.
 116                 */
 117                if (!PageUptodate(page)) {
 118                        err = -EIO;
 119                        goto error;
 120                }
 121
 122                /*
 123                 * Page is ok afterall, we are done.
 124                 */
 125                unlock_page(page);
 126        }
 127
 128        return 0;
 129error:
 130        unlock_page(page);
 131        return err;
 132}
 133
 134static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
 135        .can_merge = 0,
 136        .map = generic_pipe_buf_map,
 137        .unmap = generic_pipe_buf_unmap,
 138        .confirm = page_cache_pipe_buf_confirm,
 139        .release = page_cache_pipe_buf_release,
 140        .steal = page_cache_pipe_buf_steal,
 141        .get = generic_pipe_buf_get,
 142};
 143
 144static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 145                                    struct pipe_buffer *buf)
 146{
 147        if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
 148                return 1;
 149
 150        buf->flags |= PIPE_BUF_FLAG_LRU;
 151        return generic_pipe_buf_steal(pipe, buf);
 152}
 153
 154static const struct pipe_buf_operations user_page_pipe_buf_ops = {
 155        .can_merge = 0,
 156        .map = generic_pipe_buf_map,
 157        .unmap = generic_pipe_buf_unmap,
 158        .confirm = generic_pipe_buf_confirm,
 159        .release = page_cache_pipe_buf_release,
 160        .steal = user_page_pipe_buf_steal,
 161        .get = generic_pipe_buf_get,
 162};
 163
 164/**
 165 * splice_to_pipe - fill passed data into a pipe
 166 * @pipe:       pipe to fill
 167 * @spd:        data to fill
 168 *
 169 * Description:
 170 *    @spd contains a map of pages and len/offset tuples, along with
 171 *    the struct pipe_buf_operations associated with these pages. This
 172 *    function will link that data to the pipe.
 173 *
 174 */
 175ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 176                       struct splice_pipe_desc *spd)
 177{
 178        unsigned int spd_pages = spd->nr_pages;
 179        int ret, do_wakeup, page_nr;
 180
 181        ret = 0;
 182        do_wakeup = 0;
 183        page_nr = 0;
 184
 185        pipe_lock(pipe);
 186
 187        for (;;) {
 188                if (!pipe->readers) {
 189                        send_sig(SIGPIPE, current, 0);
 190                        if (!ret)
 191                                ret = -EPIPE;
 192                        break;
 193                }
 194
 195                if (pipe->nrbufs < PIPE_BUFFERS) {
 196                        int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
 197                        struct pipe_buffer *buf = pipe->bufs + newbuf;
 198
 199                        buf->page = spd->pages[page_nr];
 200                        buf->offset = spd->partial[page_nr].offset;
 201                        buf->len = spd->partial[page_nr].len;
 202                        buf->private = spd->partial[page_nr].private;
 203                        buf->ops = spd->ops;
 204                        if (spd->flags & SPLICE_F_GIFT)
 205                                buf->flags |= PIPE_BUF_FLAG_GIFT;
 206
 207                        pipe->nrbufs++;
 208                        page_nr++;
 209                        ret += buf->len;
 210
 211                        if (pipe->inode)
 212                                do_wakeup = 1;
 213
 214                        if (!--spd->nr_pages)
 215                                break;
 216                        if (pipe->nrbufs < PIPE_BUFFERS)
 217                                continue;
 218
 219                        break;
 220                }
 221
 222                if (spd->flags & SPLICE_F_NONBLOCK) {
 223                        if (!ret)
 224                                ret = -EAGAIN;
 225                        break;
 226                }
 227
 228                if (signal_pending(current)) {
 229                        if (!ret)
 230                                ret = -ERESTARTSYS;
 231                        break;
 232                }
 233
 234                if (do_wakeup) {
 235                        smp_mb();
 236                        if (waitqueue_active(&pipe->wait))
 237                                wake_up_interruptible_sync(&pipe->wait);
 238                        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 239                        do_wakeup = 0;
 240                }
 241
 242                pipe->waiting_writers++;
 243                pipe_wait(pipe);
 244                pipe->waiting_writers--;
 245        }
 246
 247        pipe_unlock(pipe);
 248
 249        if (do_wakeup) {
 250                smp_mb();
 251                if (waitqueue_active(&pipe->wait))
 252                        wake_up_interruptible(&pipe->wait);
 253                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 254        }
 255
 256        while (page_nr < spd_pages)
 257                spd->spd_release(spd, page_nr++);
 258
 259        return ret;
 260}
 261
 262static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
 263{
 264        page_cache_release(spd->pages[i]);
 265}
 266
/*
 * Collect page-cache pages of @in starting at *@ppos and link them into
 * @pipe.
 *
 * Works in two passes over at most PIPE_BUFFERS pages:
 *   1. Gather page references: a contiguous lookup first, then readahead
 *      plus per-index lookup/allocation for whatever is still missing.
 *   2. For each gathered page, start read IO if it is not uptodate, clamp
 *      the usable length against i_size and fill in the partial[] map.
 *
 * Gathered pages that were not put into the map are released. Returns the
 * result of splice_to_pipe() when at least one page was mapped, otherwise
 * 'error' (0 or a negative errno). *@ppos itself is not modified here.
 *
 * NOTE(review): the return type is int while splice_to_pipe() returns
 * ssize_t - presumably harmless for at most PIPE_BUFFERS pages; confirm.
 */
 267static int
 268__generic_file_splice_read(struct file *in, loff_t *ppos,
 269                           struct pipe_inode_info *pipe, size_t len,
 270                           unsigned int flags)
 271{
 272        struct address_space *mapping = in->f_mapping;
 273        unsigned int loff, nr_pages, req_pages;
 274        struct page *pages[PIPE_BUFFERS];
 275        struct partial_page partial[PIPE_BUFFERS];
 276        struct page *page;
 277        pgoff_t index, end_index;
 278        loff_t isize;
 279        int error, page_nr;
 280        struct splice_pipe_desc spd = {
 281                .pages = pages,
 282                .partial = partial,
 283                .flags = flags,
 284                .ops = &page_cache_pipe_buf_ops,
 285                .spd_release = spd_release_page,
 286        };
 287
        /*
         * index: first page index, loff: byte offset inside that page,
         * req_pages: pages needed to cover the request, nr_pages: the same
         * but capped at PIPE_BUFFERS.
         */
 288        index = *ppos >> PAGE_CACHE_SHIFT;
 289        loff = *ppos & ~PAGE_CACHE_MASK;
 290        req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 291        nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
 292
 293        /*
 294         * Lookup the (hopefully) full range of pages we need.
 295         */
 296        spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
 297        index += spd.nr_pages;
 298
 299        /*
 300         * If find_get_pages_contig() returned fewer pages than we needed,
 301         * readahead/allocate the rest and fill in the holes.
 302         */
 303        if (spd.nr_pages < nr_pages)
 304                page_cache_sync_readahead(mapping, &in->f_ra, in,
 305                                index, req_pages - spd.nr_pages);
 306
 307        error = 0;
 308        while (spd.nr_pages < nr_pages) {
 309                /*
 310                 * Page could be there, find_get_pages_contig() breaks on
 311                 * the first hole.
 312                 */
 313                page = find_get_page(mapping, index);
 314                if (!page) {
 315                        /*
 316                         * page didn't exist, allocate one.
 317                         */
 318                        page = page_cache_alloc_cold(mapping);
 319                        if (!page)
 320                                break;
 321
 322                        error = add_to_page_cache_lru(page, mapping, index,
 323                                                mapping_gfp_mask(mapping));
 324                        if (unlikely(error)) {
 325                                page_cache_release(page);
                                /*
                                 * -EEXIST: retry the lookup for this same
                                 * index (index is not advanced).
                                 */
 326                                if (error == -EEXIST)
 327                                        continue;
 328                                break;
 329                        }
 330                        /*
 331                         * add_to_page_cache() locks the page, unlock it
 332                         * to avoid convoluting the logic below even more.
 333                         */
 334                        unlock_page(page);
 335                }
 336
 337                pages[spd.nr_pages++] = page;
 338                index++;
 339        }
 340
 341        /*
 342         * Now loop over the map and see if we need to start IO on any
 343         * pages, fill in the partial map, etc.
 344         */
        /*
         * From here on nr_pages is the number of pages gathered above and
         * spd.nr_pages restarts at zero, counting pages entered in the map.
         */
 345        index = *ppos >> PAGE_CACHE_SHIFT;
 346        nr_pages = spd.nr_pages;
 347        spd.nr_pages = 0;
 348        for (page_nr = 0; page_nr < nr_pages; page_nr++) {
 349                unsigned int this_len;
 350
 351                if (!len)
 352                        break;
 353
 354                /*
 355                 * this_len is the max we'll use from this page
 356                 */
 357                this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
 358                page = pages[page_nr];
 359
 360                if (PageReadahead(page))
 361                        page_cache_async_readahead(mapping, &in->f_ra, in,
 362                                        page, index, req_pages - page_nr);
 363
 364                /*
 365                 * If the page isn't uptodate, we may need to start io on it
 366                 */
 367                if (!PageUptodate(page)) {
 368                        lock_page(page);
 369
 370                        /*
 371                         * Page was truncated, or invalidated by the
 372                         * filesystem.  Redo the find/create, but this time the
 373                         * page is kept locked, so there's no chance of another
 374                         * race with truncate/invalidate.
 375                         */
 376                        if (!page->mapping) {
 377                                unlock_page(page);
 378                                page = find_or_create_page(mapping, index,
 379                                                mapping_gfp_mask(mapping));
 380
 381                                if (!page) {
 382                                        error = -ENOMEM;
 383                                        break;
 384                                }
                                /*
                                 * Swap the stale page in the map for the
                                 * replacement and drop the stale reference.
                                 */
 385                                page_cache_release(pages[page_nr]);
 386                                pages[page_nr] = page;
 387                        }
 388                        /*
 389                         * page was already under io and is now done, great
 390                         */
 391                        if (PageUptodate(page)) {
 392                                unlock_page(page);
 393                                goto fill_it;
 394                        }
 395
 396                        /*
 397                         * need to read in the page
 398                         */
 399                        error = mapping->a_ops->readpage(in, page);
 400                        if (unlikely(error)) {
 401                                /*
 402                                 * We really should re-lookup the page here,
 403                                 * but it complicates things a lot. Instead
 404                                 * lets just do what we already stored, and
 405                                 * we'll get it the next time we are called.
 406                                 */
 407                                if (error == AOP_TRUNCATED_PAGE)
 408                                        error = 0;
 409
 410                                break;
 411                        }
 412                }
 413fill_it:
 414                /*
 415                 * i_size must be checked after PageUptodate.
 416                 */
 417                isize = i_size_read(mapping->host);
 418                end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 419                if (unlikely(!isize || index > end_index))
 420                        break;
 421
 422                /*
 423                 * if this is the last page, see if we need to shrink
 424                 * the length and stop
 425                 */
 426                if (end_index == index) {
 427                        unsigned int plen;
 428
 429                        /*
 430                         * max good bytes in this page
 431                         */
 432                        plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
 433                        if (plen <= loff)
 434                                break;
 435
 436                        /*
 437                         * force quit after adding this page
 438                         */
 439                        this_len = min(this_len, plen - loff);
 440                        len = this_len;
 441                }
 442
 443                partial[page_nr].offset = loff;
 444                partial[page_nr].len = this_len;
 445                len -= this_len;
                /* only the first page can start at a non-zero offset */
 446                loff = 0;
 447                spd.nr_pages++;
 448                index++;
 449        }
 450
 451        /*
 452         * Release any pages at the end, if we quit early. 'page_nr' is how far
 453         * we got, 'nr_pages' is how many pages are in the map.
 454         */
 455        while (page_nr < nr_pages)
 456                page_cache_release(pages[page_nr++]);
 457        in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 458
 459        if (spd.nr_pages)
 460                return splice_to_pipe(pipe, &spd);
 461
 462        return error;
 463}
 464
 465/**
 466 * generic_file_splice_read - splice data from file to a pipe
 467 * @in:         file to splice from
 468 * @ppos:       position in @in
 469 * @pipe:       pipe to splice to
 470 * @len:        number of bytes to splice
 471 * @flags:      splice modifier flags
 472 *
 473 * Description:
 474 *    Will read pages from given file and fill them into a pipe. Can be
 475 *    used as long as the address_space operations for the source implements
 476 *    a readpage() hook.
 477 *
 478 */
 479ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 480                                 struct pipe_inode_info *pipe, size_t len,
 481                                 unsigned int flags)
 482{
 483        loff_t isize, left;
 484        int ret;
 485
 486        isize = i_size_read(in->f_mapping->host);
 487        if (unlikely(*ppos >= isize))
 488                return 0;
 489
 490        left = isize - *ppos;
 491        if (unlikely(left < len))
 492                len = left;
 493
 494        ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
 495        if (ret > 0) {
 496                *ppos += ret;
 497                file_accessed(in);
 498        }
 499
 500        return ret;
 501}
 502EXPORT_SYMBOL(generic_file_splice_read);
 503
 504static const struct pipe_buf_operations default_pipe_buf_ops = {
 505        .can_merge = 0,
 506        .map = generic_pipe_buf_map,
 507        .unmap = generic_pipe_buf_unmap,
 508        .confirm = generic_pipe_buf_confirm,
 509        .release = generic_pipe_buf_release,
 510        .steal = generic_pipe_buf_steal,
 511        .get = generic_pipe_buf_get,
 512};
 513
 514static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
 515                            unsigned long vlen, loff_t offset)
 516{
 517        mm_segment_t old_fs;
 518        loff_t pos = offset;
 519        ssize_t res;
 520
 521        old_fs = get_fs();
 522        set_fs(get_ds());
 523        /* The cast to a user pointer is valid due to the set_fs() */
 524        res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
 525        set_fs(old_fs);
 526
 527        return res;
 528}
 529
 530static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
 531                            loff_t pos)
 532{
 533        mm_segment_t old_fs;
 534        ssize_t res;
 535
 536        old_fs = get_fs();
 537        set_fs(get_ds());
 538        /* The cast to a user pointer is valid due to the set_fs() */
 539        res = vfs_write(file, (const char __user *)buf, count, &pos);
 540        set_fs(old_fs);
 541
 542        return res;
 543}
 544
 545ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 546                                 struct pipe_inode_info *pipe, size_t len,
 547                                 unsigned int flags)
 548{
 549        unsigned int nr_pages;
 550        unsigned int nr_freed;
 551        size_t offset;
 552        struct page *pages[PIPE_BUFFERS];
 553        struct partial_page partial[PIPE_BUFFERS];
 554        struct iovec vec[PIPE_BUFFERS];
 555        pgoff_t index;
 556        ssize_t res;
 557        size_t this_len;
 558        int error;
 559        int i;
 560        struct splice_pipe_desc spd = {
 561                .pages = pages,
 562                .partial = partial,
 563                .flags = flags,
 564                .ops = &default_pipe_buf_ops,
 565                .spd_release = spd_release_page,
 566        };
 567
 568        index = *ppos >> PAGE_CACHE_SHIFT;
 569        offset = *ppos & ~PAGE_CACHE_MASK;
 570        nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 571
 572        for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
 573                struct page *page;
 574
 575                page = alloc_page(GFP_USER);
 576                error = -ENOMEM;
 577                if (!page)
 578                        goto err;
 579
 580                this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
 581                vec[i].iov_base = (void __user *) page_address(page);
 582                vec[i].iov_len = this_len;
 583                pages[i] = page;
 584                spd.nr_pages++;
 585                len -= this_len;
 586                offset = 0;
 587        }
 588
 589        res = kernel_readv(in, vec, spd.nr_pages, *ppos);
 590        if (res < 0) {
 591                error = res;
 592                goto err;
 593        }
 594
 595        error = 0;
 596        if (!res)
 597                goto err;
 598
 599        nr_freed = 0;
 600        for (i = 0; i < spd.nr_pages; i++) {
 601                this_len = min_t(size_t, vec[i].iov_len, res);
 602                partial[i].offset = 0;
 603                partial[i].len = this_len;
 604                if (!this_len) {
 605                        __free_page(pages[i]);
 606                        pages[i] = NULL;
 607                        nr_freed++;
 608                }
 609                res -= this_len;
 610        }
 611        spd.nr_pages -= nr_freed;
 612
 613        res = splice_to_pipe(pipe, &spd);
 614        if (res > 0)
 615                *ppos += res;
 616
 617        return res;
 618
 619err:
 620        for (i = 0; i < spd.nr_pages; i++)
 621                __free_page(pages[i]);
 622
 623        return error;
 624}
 625EXPORT_SYMBOL(default_file_splice_read);
 626
 627/*
 628 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 629 * using sendpage(). Return the number of bytes sent.
 630 */
 631static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 632                            struct pipe_buffer *buf, struct splice_desc *sd)
 633{
 634        struct file *file = sd->u.file;
 635        loff_t pos = sd->pos;
 636        int ret, more;
 637
 638        ret = buf->ops->confirm(pipe, buf);
 639        if (!ret) {
 640                more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
 641                if (file->f_op && file->f_op->sendpage)
 642                        ret = file->f_op->sendpage(file, buf->page, buf->offset,
 643                                                   sd->len, &pos, more);
 644                else
 645                        ret = -EINVAL;
 646        }
 647
 648        return ret;
 649}
 650
 651/*
 652 * This is a little more tricky than the file -> pipe splicing. There are
 653 * basically three cases:
 654 *
 655 *      - Destination page already exists in the address space and there
 656 *        are users of it. For that case we have no other option that
 657 *        copying the data. Tough luck.
 658 *      - Destination page already exists in the address space, but there
 659 *        are no users of it. Make sure it's uptodate, then drop it. Fall
 660 *        through to last case.
 661 *      - Destination page does not exist, we can add the pipe page to
 662 *        the page cache and avoid the copy.
 663 *
 664 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 665 * sd->flags), we attempt to migrate pages from the pipe to the output
 666 * file address space page cache. This is possible if no one else has
 667 * the pipe page referenced outside of the pipe and page cache. If
 668 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 669 * a new page in the output file page cache and fill/dirty that.
 670 */
 671int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 672                 struct splice_desc *sd)
 673{
 674        struct file *file = sd->u.file;
 675        struct address_space *mapping = file->f_mapping;
 676        unsigned int offset, this_len;
 677        struct page *page;
 678        void *fsdata;
 679        int ret;
 680
 681        /*
 682         * make sure the data in this buffer is uptodate
 683         */
 684        ret = buf->ops->confirm(pipe, buf);
 685        if (unlikely(ret))
 686                return ret;
 687
 688        offset = sd->pos & ~PAGE_CACHE_MASK;
 689
 690        this_len = sd->len;
 691        if (this_len + offset > PAGE_CACHE_SIZE)
 692                this_len = PAGE_CACHE_SIZE - offset;
 693
 694        ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
 695                                AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 696        if (unlikely(ret))
 697                goto out;
 698
 699        if (buf->page != page) {
 700                /*
 701                 * Careful, ->map() uses KM_USER0!
 702                 */
 703                char *src = buf->ops->map(pipe, buf, 1);
 704                char *dst = kmap_atomic(page, KM_USER1);
 705
 706                memcpy(dst + offset, src + buf->offset, this_len);
 707                flush_dcache_page(page);
 708                kunmap_atomic(dst, KM_USER1);
 709                buf->ops->unmap(pipe, buf, src);
 710        }
 711        ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
 712                                page, fsdata);
 713out:
 714        return ret;
 715}
 716EXPORT_SYMBOL(pipe_to_file);
 717
 718static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
 719{
 720        smp_mb();
 721        if (waitqueue_active(&pipe->wait))
 722                wake_up_interruptible(&pipe->wait);
 723        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 724}
 725
 726/**
 727 * splice_from_pipe_feed - feed available data from a pipe to a file
 728 * @pipe:       pipe to splice from
 729 * @sd:         information to @actor
 730 * @actor:      handler that splices the data
 731 *
 732 * Description:
 733 *    This function loops over the pipe and calls @actor to do the
 734 *    actual moving of a single struct pipe_buffer to the desired
 735 *    destination.  It returns when there's no more buffers left in
 736 *    the pipe or if the requested number of bytes (@sd->total_len)
 737 *    have been copied.  It returns a positive number (one) if the
 738 *    pipe needs to be filled with more data, zero if the required
 739 *    number of bytes have been copied and -errno on error.
 740 *
 741 *    This, together with splice_from_pipe_{begin,end,next}, may be
 742 *    used to implement the functionality of __splice_from_pipe() when
 743 *    locking is required around copying the pipe buffers to the
 744 *    destination.
 745 */
 746int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 747                          splice_actor *actor)
 748{
 749        int ret;
 750
 751        while (pipe->nrbufs) {
 752                struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 753                const struct pipe_buf_operations *ops = buf->ops;
 754
 755                sd->len = buf->len;
 756                if (sd->len > sd->total_len)
 757                        sd->len = sd->total_len;
 758
 759                ret = actor(pipe, buf, sd);
 760                if (ret <= 0) {
 761                        if (ret == -ENODATA)
 762                                ret = 0;
 763                        return ret;
 764                }
 765                buf->offset += ret;
 766                buf->len -= ret;
 767
 768                sd->num_spliced += ret;
 769                sd->len -= ret;
 770                sd->pos += ret;
 771                sd->total_len -= ret;
 772
 773                if (!buf->len) {
 774                        buf->ops = NULL;
 775                        ops->release(pipe, buf);
 776                        pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
 777                        pipe->nrbufs--;
 778                        if (pipe->inode)
 779                                sd->need_wakeup = true;
 780                }
 781
 782                if (!sd->total_len)
 783                        return 0;
 784        }
 785
 786        return 1;
 787}
 788EXPORT_SYMBOL(splice_from_pipe_feed);
 789
 790/**
 791 * splice_from_pipe_next - wait for some data to splice from
 792 * @pipe:       pipe to splice from
 793 * @sd:         information about the splice operation
 794 *
 795 * Description:
 796 *    This function will wait for some data and return a positive
 797 *    value (one) if pipe buffers are available.  It will return zero
 798 *    or -errno if no more data needs to be spliced.
 799 */
 800int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
 801{
 802        while (!pipe->nrbufs) {
 803                if (!pipe->writers)
 804                        return 0;
 805
 806                if (!pipe->waiting_writers && sd->num_spliced)
 807                        return 0;
 808
 809                if (sd->flags & SPLICE_F_NONBLOCK)
 810                        return -EAGAIN;
 811
 812                if (signal_pending(current))
 813                        return -ERESTARTSYS;
 814
 815                if (sd->need_wakeup) {
 816                        wakeup_pipe_writers(pipe);
 817                        sd->need_wakeup = false;
 818                }
 819
 820                pipe_wait(pipe);
 821        }
 822
 823        return 1;
 824}
 825EXPORT_SYMBOL(splice_from_pipe_next);
 826
 827/**
 828 * splice_from_pipe_begin - start splicing from pipe
 829 * @sd:         information about the splice operation
 830 *
 831 * Description:
 832 *    This function should be called before a loop containing
 833 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 834 *    initialize the necessary fields of @sd.
 835 */
 836void splice_from_pipe_begin(struct splice_desc *sd)
 837{
 838        sd->num_spliced = 0;
 839        sd->need_wakeup = false;
 840}
 841EXPORT_SYMBOL(splice_from_pipe_begin);
 842
 843/**
 844 * splice_from_pipe_end - finish splicing from pipe
 845 * @pipe:       pipe to splice from
 846 * @sd:         information about the splice operation
 847 *
 848 * Description:
 849 *    This function will wake up pipe writers if necessary.  It should
 850 *    be called after a loop containing splice_from_pipe_next() and
 851 *    splice_from_pipe_feed().
 852 */
 853void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
 854{
 855        if (sd->need_wakeup)
 856                wakeup_pipe_writers(pipe);
 857}
 858EXPORT_SYMBOL(splice_from_pipe_end);
 859
 860/**
 861 * __splice_from_pipe - splice data from a pipe to given actor
 862 * @pipe:       pipe to splice from
 863 * @sd:         information to @actor
 864 * @actor:      handler that splices the data
 865 *
 866 * Description:
 867 *    This function does little more than loop over the pipe and call
 868 *    @actor to do the actual moving of a single struct pipe_buffer to
 869 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 870 *    pipe_to_user.
 871 *
 872 */
 873ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
 874                           splice_actor *actor)
 875{
 876        int ret;
 877
 878        splice_from_pipe_begin(sd);
 879        do {
 880                ret = splice_from_pipe_next(pipe, sd);
 881                if (ret > 0)
 882                        ret = splice_from_pipe_feed(pipe, sd, actor);
 883        } while (ret > 0);
 884        splice_from_pipe_end(pipe, sd);
 885
 886        return sd->num_spliced ? sd->num_spliced : ret;
 887}
 888EXPORT_SYMBOL(__splice_from_pipe);
 889
 890/**
 891 * splice_from_pipe - splice data from a pipe to a file
 892 * @pipe:       pipe to splice from
 893 * @out:        file to splice to
 894 * @ppos:       position in @out
 895 * @len:        how many bytes to splice
 896 * @flags:      splice modifier flags
 897 * @actor:      handler that splices the data
 898 *
 899 * Description:
 900 *    See __splice_from_pipe. This function locks the pipe inode,
 901 *    otherwise it's identical to __splice_from_pipe().
 902 *
 903 */
 904ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 905                         loff_t *ppos, size_t len, unsigned int flags,
 906                         splice_actor *actor)
 907{
 908        ssize_t ret;
 909        struct splice_desc sd = {
 910                .total_len = len,
 911                .flags = flags,
 912                .pos = *ppos,
 913                .u.file = out,
 914        };
 915
 916        pipe_lock(pipe);
 917        ret = __splice_from_pipe(pipe, &sd, actor);
 918        pipe_unlock(pipe);
 919
 920        return ret;
 921}
 922
 923/**
 924 * generic_file_splice_write - splice data from a pipe to a file
 925 * @pipe:       pipe info
 926 * @out:        file to write to
 927 * @ppos:       position in @out
 928 * @len:        number of bytes to splice
 929 * @flags:      splice modifier flags
 930 *
 931 * Description:
 932 *    Will either move or copy pages (determined by @flags options) from
 933 *    the given pipe inode to the given file.
 934 *
 935 */
 936ssize_t
 937generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 938                          loff_t *ppos, size_t len, unsigned int flags)
 939{
 940        struct address_space *mapping = out->f_mapping;
 941        struct inode *inode = mapping->host;
 942        struct splice_desc sd = {
 943                .total_len = len,
 944                .flags = flags,
 945                .pos = *ppos,
 946                .u.file = out,
 947        };
 948        ssize_t ret;
 949
 950        pipe_lock(pipe);
 951
 952        splice_from_pipe_begin(&sd);
 953        do {
 954                ret = splice_from_pipe_next(pipe, &sd);
 955                if (ret <= 0)
 956                        break;
 957
 958                mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
 959                ret = file_remove_suid(out);
 960                if (!ret) {
 961                        file_update_time(out);
 962                        ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
 963                }
 964                mutex_unlock(&inode->i_mutex);
 965        } while (ret > 0);
 966        splice_from_pipe_end(pipe, &sd);
 967
 968        pipe_unlock(pipe);
 969
 970        if (sd.num_spliced)
 971                ret = sd.num_spliced;
 972
 973        if (ret > 0) {
 974                unsigned long nr_pages;
 975                int err;
 976
 977                nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 978
 979                err = generic_write_sync(out, *ppos, ret);
 980                if (err)
 981                        ret = err;
 982                else
 983                        *ppos += ret;
 984                balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 985        }
 986
 987        return ret;
 988}
 989
 990EXPORT_SYMBOL(generic_file_splice_write);
 991
 992static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 993                          struct splice_desc *sd)
 994{
 995        int ret;
 996        void *data;
 997
 998        ret = buf->ops->confirm(pipe, buf);
 999        if (ret)
1000                return ret;
1001
1002        data = buf->ops->map(pipe, buf, 0);
1003        ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
1004        buf->ops->unmap(pipe, buf, data);
1005
1006        return ret;
1007}
1008
1009static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
1010                                         struct file *out, loff_t *ppos,
1011                                         size_t len, unsigned int flags)
1012{
1013        ssize_t ret;
1014
1015        ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
1016        if (ret > 0)
1017                *ppos += ret;
1018
1019        return ret;
1020}
1021
1022/**
1023 * generic_splice_sendpage - splice data from a pipe to a socket
1024 * @pipe:       pipe to splice from
1025 * @out:        socket to write to
1026 * @ppos:       position in @out
1027 * @len:        number of bytes to splice
1028 * @flags:      splice modifier flags
1029 *
1030 * Description:
1031 *    Will send @len bytes from the pipe to a network socket. No data copying
1032 *    is involved.
1033 *
1034 */
1035ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
1036                                loff_t *ppos, size_t len, unsigned int flags)
1037{
1038        return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
1039}
1040
1041EXPORT_SYMBOL(generic_splice_sendpage);
1042
1043/*
1044 * Attempt to initiate a splice from pipe to file.
1045 */
1046static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
1047                           loff_t *ppos, size_t len, unsigned int flags)
1048{
1049        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
1050                                loff_t *, size_t, unsigned int);
1051        int ret;
1052
1053        if (unlikely(!(out->f_mode & FMODE_WRITE)))
1054                return -EBADF;
1055
1056        if (unlikely(out->f_flags & O_APPEND))
1057                return -EINVAL;
1058
1059        ret = rw_verify_area(WRITE, out, ppos, len);
1060        if (unlikely(ret < 0))
1061                return ret;
1062
1063        if (out->f_op && out->f_op->splice_write)
1064                splice_write = out->f_op->splice_write;
1065        else
1066                splice_write = default_file_splice_write;
1067
1068        return splice_write(pipe, out, ppos, len, flags);
1069}
1070
1071/*
1072 * Attempt to initiate a splice from a file to a pipe.
1073 */
1074static long do_splice_to(struct file *in, loff_t *ppos,
1075                         struct pipe_inode_info *pipe, size_t len,
1076                         unsigned int flags)
1077{
1078        ssize_t (*splice_read)(struct file *, loff_t *,
1079                               struct pipe_inode_info *, size_t, unsigned int);
1080        int ret;
1081
1082        if (unlikely(!(in->f_mode & FMODE_READ)))
1083                return -EBADF;
1084
1085        ret = rw_verify_area(READ, in, ppos, len);
1086        if (unlikely(ret < 0))
1087                return ret;
1088
1089        if (in->f_op && in->f_op->splice_read)
1090                splice_read = in->f_op->splice_read;
1091        else
1092                splice_read = default_file_splice_read;
1093
1094        return splice_read(in, ppos, pipe, len, flags);
1095}
1096
1097/**
1098 * splice_direct_to_actor - splices data directly between two non-pipes
1099 * @in:         file to splice from
1100 * @sd:         actor information on where to splice to
1101 * @actor:      handles the data splicing
1102 *
1103 * Description:
1104 *    This is a special case helper to splice directly between two
1105 *    points, without requiring an explicit pipe. Internally an allocated
1106 *    pipe is cached in the process, and reused during the lifetime of
1107 *    that process.
1108 *
1109 */
1110ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1111                               splice_direct_actor *actor)
1112{
1113        struct pipe_inode_info *pipe;
1114        long ret, bytes;
1115        umode_t i_mode;
1116        size_t len;
1117        int i, flags;
1118
1119        /*
1120         * We require the input being a regular file, as we don't want to
1121         * randomly drop data for eg socket -> socket splicing. Use the
1122         * piped splicing for that!
1123         */
1124        i_mode = in->f_path.dentry->d_inode->i_mode;
1125        if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
1126                return -EINVAL;
1127
1128        /*
1129         * neither in nor out is a pipe, setup an internal pipe attached to
1130         * 'out' and transfer the wanted data from 'in' to 'out' through that
1131         */
1132        pipe = current->splice_pipe;
1133        if (unlikely(!pipe)) {
1134                pipe = alloc_pipe_info(NULL);
1135                if (!pipe)
1136                        return -ENOMEM;
1137
1138                /*
1139                 * We don't have an immediate reader, but we'll read the stuff
1140                 * out of the pipe right after the splice_to_pipe(). So set
1141                 * PIPE_READERS appropriately.
1142                 */
1143                pipe->readers = 1;
1144
1145                current->splice_pipe = pipe;
1146        }
1147
1148        /*
1149         * Do the splice.
1150         */
1151        ret = 0;
1152        bytes = 0;
1153        len = sd->total_len;
1154        flags = sd->flags;
1155
1156        /*
1157         * Don't block on output, we have to drain the direct pipe.
1158         */
1159        sd->flags &= ~SPLICE_F_NONBLOCK;
1160
1161        while (len) {
1162                size_t read_len;
1163                loff_t pos = sd->pos, prev_pos = pos;
1164
1165                ret = do_splice_to(in, &pos, pipe, len, flags);
1166                if (unlikely(ret <= 0))
1167                        goto out_release;
1168
1169                read_len = ret;
1170                sd->total_len = read_len;
1171
1172                /*
1173                 * NOTE: nonblocking mode only applies to the input. We
1174                 * must not do the output in nonblocking mode as then we
1175                 * could get stuck data in the internal pipe:
1176                 */
1177                ret = actor(pipe, sd);
1178                if (unlikely(ret <= 0)) {
1179                        sd->pos = prev_pos;
1180                        goto out_release;
1181                }
1182
1183                bytes += ret;
1184                len -= ret;
1185                sd->pos = pos;
1186
1187                if (ret < read_len) {
1188                        sd->pos = prev_pos + ret;
1189                        goto out_release;
1190                }
1191        }
1192
1193done:
1194        pipe->nrbufs = pipe->curbuf = 0;
1195        file_accessed(in);
1196        return bytes;
1197
1198out_release:
1199        /*
1200         * If we did an incomplete transfer we must release
1201         * the pipe buffers in question:
1202         */
1203        for (i = 0; i < PIPE_BUFFERS; i++) {
1204                struct pipe_buffer *buf = pipe->bufs + i;
1205
1206                if (buf->ops) {
1207                        buf->ops->release(pipe, buf);
1208                        buf->ops = NULL;
1209                }
1210        }
1211
1212        if (!bytes)
1213                bytes = ret;
1214
1215        goto done;
1216}
1217EXPORT_SYMBOL(splice_direct_to_actor);
1218
1219static int direct_splice_actor(struct pipe_inode_info *pipe,
1220                               struct splice_desc *sd)
1221{
1222        struct file *file = sd->u.file;
1223
1224        return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
1225                              sd->flags);
1226}
1227
1228/**
1229 * do_splice_direct - splices data directly between two files
1230 * @in:         file to splice from
1231 * @ppos:       input file offset
1232 * @out:        file to splice to
1233 * @len:        number of bytes to splice
1234 * @flags:      splice modifier flags
1235 *
1236 * Description:
1237 *    For use by do_sendfile(). splice can easily emulate sendfile, but
1238 *    doing it in the application would incur an extra system call
1239 *    (splice in + splice out, as compared to just sendfile()). So this helper
1240 *    can splice directly through a process-private pipe.
1241 *
1242 */
1243long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1244                      size_t len, unsigned int flags)
1245{
1246        struct splice_desc sd = {
1247                .len            = len,
1248                .total_len      = len,
1249                .flags          = flags,
1250                .pos            = *ppos,
1251                .u.file         = out,
1252        };
1253        long ret;
1254
1255        ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1256        if (ret > 0)
1257                *ppos = sd.pos;
1258
1259        return ret;
1260}
1261
1262static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1263                               struct pipe_inode_info *opipe,
1264                               size_t len, unsigned int flags);
1265/*
1266 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1267 * location, so checking ->i_pipe is not enough to verify that this is a
1268 * pipe.
1269 */
1270static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1271{
1272        if (S_ISFIFO(inode->i_mode))
1273                return inode->i_pipe;
1274
1275        return NULL;
1276}
1277
1278/*
1279 * Determine where to splice to/from.
1280 */
1281static long do_splice(struct file *in, loff_t __user *off_in,
1282                      struct file *out, loff_t __user *off_out,
1283                      size_t len, unsigned int flags)
1284{
1285        struct pipe_inode_info *ipipe;
1286        struct pipe_inode_info *opipe;
1287        loff_t offset, *off;
1288        long ret;
1289
1290        ipipe = pipe_info(in->f_path.dentry->d_inode);
1291        opipe = pipe_info(out->f_path.dentry->d_inode);
1292
1293        if (ipipe && opipe) {
1294                if (off_in || off_out)
1295                        return -ESPIPE;
1296
1297                if (!(in->f_mode & FMODE_READ))
1298                        return -EBADF;
1299
1300                if (!(out->f_mode & FMODE_WRITE))
1301                        return -EBADF;
1302
1303                /* Splicing to self would be fun, but... */
1304                if (ipipe == opipe)
1305                        return -EINVAL;
1306
1307                return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1308        }
1309
1310        if (ipipe) {
1311                if (off_in)
1312                        return -ESPIPE;
1313                if (off_out) {
1314                        if (!out->f_op || !out->f_op->llseek ||
1315                            out->f_op->llseek == no_llseek)
1316                                return -EINVAL;
1317                        if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1318                                return -EFAULT;
1319                        off = &offset;
1320                } else
1321                        off = &out->f_pos;
1322
1323                ret = do_splice_from(ipipe, out, off, len, flags);
1324
1325                if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1326                        ret = -EFAULT;
1327
1328                return ret;
1329        }
1330
1331        if (opipe) {
1332                if (off_out)
1333                        return -ESPIPE;
1334                if (off_in) {
1335                        if (!in->f_op || !in->f_op->llseek ||
1336                            in->f_op->llseek == no_llseek)
1337                                return -EINVAL;
1338                        if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1339                                return -EFAULT;
1340                        off = &offset;
1341                } else
1342                        off = &in->f_pos;
1343
1344                ret = do_splice_to(in, off, opipe, len, flags);
1345
1346                if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1347                        ret = -EFAULT;
1348
1349                return ret;
1350        }
1351
1352        return -EINVAL;
1353}
1354
1355/*
1356 * Map an iov into an array of pages and offset/length tupples. With the
1357 * partial_page structure, we can map several non-contiguous ranges into
1358 * our ones pages[] map instead of splitting that operation into pieces.
1359 * Could easily be exported as a generic helper for other users, in which
1360 * case one would probably want to add a 'max_nr_pages' parameter as well.
1361 */
1362static int get_iovec_page_array(const struct iovec __user *iov,
1363                                unsigned int nr_vecs, struct page **pages,
1364                                struct partial_page *partial, int aligned)
1365{
1366        int buffers = 0, error = 0;
1367
1368        while (nr_vecs) {
1369                unsigned long off, npages;
1370                struct iovec entry;
1371                void __user *base;
1372                size_t len;
1373                int i;
1374
1375                error = -EFAULT;
1376                if (copy_from_user(&entry, iov, sizeof(entry)))
1377                        break;
1378
1379                base = entry.iov_base;
1380                len = entry.iov_len;
1381
1382                /*
1383                 * Sanity check this iovec. 0 read succeeds.
1384                 */
1385                error = 0;
1386                if (unlikely(!len))
1387                        break;
1388                error = -EFAULT;
1389                if (!access_ok(VERIFY_READ, base, len))
1390                        break;
1391
1392                /*
1393                 * Get this base offset and number of pages, then map
1394                 * in the user pages.
1395                 */
1396                off = (unsigned long) base & ~PAGE_MASK;
1397
1398                /*
1399                 * If asked for alignment, the offset must be zero and the
1400                 * length a multiple of the PAGE_SIZE.
1401                 */
1402                error = -EINVAL;
1403                if (aligned && (off || len & ~PAGE_MASK))
1404                        break;
1405
1406                npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1407                if (npages > PIPE_BUFFERS - buffers)
1408                        npages = PIPE_BUFFERS - buffers;
1409
1410                error = get_user_pages_fast((unsigned long)base, npages,
1411                                        0, &pages[buffers]);
1412
1413                if (unlikely(error <= 0))
1414                        break;
1415
1416                /*
1417                 * Fill this contiguous range into the partial page map.
1418                 */
1419                for (i = 0; i < error; i++) {
1420                        const int plen = min_t(size_t, len, PAGE_SIZE - off);
1421
1422                        partial[buffers].offset = off;
1423                        partial[buffers].len = plen;
1424
1425                        off = 0;
1426                        len -= plen;
1427                        buffers++;
1428                }
1429
1430                /*
1431                 * We didn't complete this iov, stop here since it probably
1432                 * means we have to move some of this into a pipe to
1433                 * be able to continue.
1434                 */
1435                if (len)
1436                        break;
1437
1438                /*
1439                 * Don't continue if we mapped fewer pages than we asked for,
1440                 * or if we mapped the max number of pages that we have
1441                 * room for.
1442                 */
1443                if (error < npages || buffers == PIPE_BUFFERS)
1444                        break;
1445
1446                nr_vecs--;
1447                iov++;
1448        }
1449
1450        if (buffers)
1451                return buffers;
1452
1453        return error;
1454}
1455
1456static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1457                        struct splice_desc *sd)
1458{
1459        char *src;
1460        int ret;
1461
1462        ret = buf->ops->confirm(pipe, buf);
1463        if (unlikely(ret))
1464                return ret;
1465
1466        /*
1467         * See if we can use the atomic maps, by prefaulting in the
1468         * pages and doing an atomic copy
1469         */
1470        if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1471                src = buf->ops->map(pipe, buf, 1);
1472                ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1473                                                        sd->len);
1474                buf->ops->unmap(pipe, buf, src);
1475                if (!ret) {
1476                        ret = sd->len;
1477                        goto out;
1478                }
1479        }
1480
1481        /*
1482         * No dice, use slow non-atomic map and copy
1483         */
1484        src = buf->ops->map(pipe, buf, 0);
1485
1486        ret = sd->len;
1487        if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1488                ret = -EFAULT;
1489
1490        buf->ops->unmap(pipe, buf, src);
1491out:
1492        if (ret > 0)
1493                sd->u.userptr += ret;
1494        return ret;
1495}
1496
1497/*
1498 * For lack of a better implementation, implement vmsplice() to userspace
1499 * as a simple copy of the pipes pages to the user iov.
1500 */
1501static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1502                             unsigned long nr_segs, unsigned int flags)
1503{
1504        struct pipe_inode_info *pipe;
1505        struct splice_desc sd;
1506        ssize_t size;
1507        int error;
1508        long ret;
1509
1510        pipe = pipe_info(file->f_path.dentry->d_inode);
1511        if (!pipe)
1512                return -EBADF;
1513
1514        pipe_lock(pipe);
1515
1516        error = ret = 0;
1517        while (nr_segs) {
1518                void __user *base;
1519                size_t len;
1520
1521                /*
1522                 * Get user address base and length for this iovec.
1523                 */
1524                error = get_user(base, &iov->iov_base);
1525                if (unlikely(error))
1526                        break;
1527                error = get_user(len, &iov->iov_len);
1528                if (unlikely(error))
1529                        break;
1530
1531                /*
1532                 * Sanity check this iovec. 0 read succeeds.
1533                 */
1534                if (unlikely(!len))
1535                        break;
1536                if (unlikely(!base)) {
1537                        error = -EFAULT;
1538                        break;
1539                }
1540
1541                if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
1542                        error = -EFAULT;
1543                        break;
1544                }
1545
1546                sd.len = 0;
1547                sd.total_len = len;
1548                sd.flags = flags;
1549                sd.u.userptr = base;
1550                sd.pos = 0;
1551
1552                size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1553                if (size < 0) {
1554                        if (!ret)
1555                                ret = size;
1556
1557                        break;
1558                }
1559
1560                ret += size;
1561
1562                if (size < len)
1563                        break;
1564
1565                nr_segs--;
1566                iov++;
1567        }
1568
1569        pipe_unlock(pipe);
1570
1571        if (!ret)
1572                ret = error;
1573
1574        return ret;
1575}
1576
1577/*
1578 * vmsplice splices a user address range into a pipe. It can be thought of
1579 * as splice-from-memory, where the regular splice is splice-from-file (or
1580 * to file). In both cases the output is a pipe, naturally.
1581 */
1582static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1583                             unsigned long nr_segs, unsigned int flags)
1584{
1585        struct pipe_inode_info *pipe;
1586        struct page *pages[PIPE_BUFFERS];
1587        struct partial_page partial[PIPE_BUFFERS];
1588        struct splice_pipe_desc spd = {
1589                .pages = pages,
1590                .partial = partial,
1591                .flags = flags,
1592                .ops = &user_page_pipe_buf_ops,
1593                .spd_release = spd_release_page,
1594        };
1595
1596        pipe = pipe_info(file->f_path.dentry->d_inode);
1597        if (!pipe)
1598                return -EBADF;
1599
1600        spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1601                                            flags & SPLICE_F_GIFT);
1602        if (spd.nr_pages <= 0)
1603                return spd.nr_pages;
1604
1605        return splice_to_pipe(pipe, &spd);
1606}
1607
1608/*
1609 * Note that vmsplice only really supports true splicing _from_ user memory
1610 * to a pipe, not the other way around. Splicing from user memory is a simple
1611 * operation that can be supported without any funky alignment restrictions
1612 * or nasty vm tricks. We simply map in the user memory and fill them into
1613 * a pipe. The reverse isn't quite as easy, though. There are two possible
1614 * solutions for that:
1615 *
1616 *      - memcpy() the data internally, at which point we might as well just
1617 *        do a regular read() on the buffer anyway.
1618 *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1619 *        has restriction limitations on both ends of the pipe).
1620 *
1621 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1622 *
1623 */
1624SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1625                unsigned long, nr_segs, unsigned int, flags)
1626{
1627        struct file *file;
1628        long error;
1629        int fput;
1630
1631        if (unlikely(nr_segs > UIO_MAXIOV))
1632                return -EINVAL;
1633        else if (unlikely(!nr_segs))
1634                return 0;
1635
1636        error = -EBADF;
1637        file = fget_light(fd, &fput);
1638        if (file) {
1639                if (file->f_mode & FMODE_WRITE)
1640                        error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1641                else if (file->f_mode & FMODE_READ)
1642                        error = vmsplice_to_user(file, iov, nr_segs, flags);
1643
1644                fput_light(file, fput);
1645        }
1646
1647        return error;
1648}
1649
1650SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1651                int, fd_out, loff_t __user *, off_out,
1652                size_t, len, unsigned int, flags)
1653{
1654        long error;
1655        struct file *in, *out;
1656        int fput_in, fput_out;
1657
1658        if (unlikely(!len))
1659                return 0;
1660
1661        error = -EBADF;
1662        in = fget_light(fd_in, &fput_in);
1663        if (in) {
1664                if (in->f_mode & FMODE_READ) {
1665                        out = fget_light(fd_out, &fput_out);
1666                        if (out) {
1667                                if (out->f_mode & FMODE_WRITE)
1668                                        error = do_splice(in, off_in,
1669                                                          out, off_out,
1670                                                          len, flags);
1671                                fput_light(out, fput_out);
1672                        }
1673                }
1674
1675                fput_light(in, fput_in);
1676        }
1677
1678        return error;
1679}
1680
1681/*
1682 * Make sure there's data to read. Wait for input if we can, otherwise
1683 * return an appropriate error.
1684 */
1685static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1686{
1687        int ret;
1688
1689        /*
1690         * Check ->nrbufs without the inode lock first. This function
1691         * is speculative anyways, so missing one is ok.
1692         */
1693        if (pipe->nrbufs)
1694                return 0;
1695
1696        ret = 0;
1697        pipe_lock(pipe);
1698
1699        while (!pipe->nrbufs) {
1700                if (signal_pending(current)) {
1701                        ret = -ERESTARTSYS;
1702                        break;
1703                }
1704                if (!pipe->writers)
1705                        break;
1706                if (!pipe->waiting_writers) {
1707                        if (flags & SPLICE_F_NONBLOCK) {
1708                                ret = -EAGAIN;
1709                                break;
1710                        }
1711                }
1712                pipe_wait(pipe);
1713        }
1714
1715        pipe_unlock(pipe);
1716        return ret;
1717}
1718
1719/*
1720 * Make sure there's writeable room. Wait for room if we can, otherwise
1721 * return an appropriate error.
1722 */
1723static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1724{
1725        int ret;
1726
1727        /*
1728         * Check ->nrbufs without the inode lock first. This function
1729         * is speculative anyways, so missing one is ok.
1730         */
1731        if (pipe->nrbufs < PIPE_BUFFERS)
1732                return 0;
1733
1734        ret = 0;
1735        pipe_lock(pipe);
1736
1737        while (pipe->nrbufs >= PIPE_BUFFERS) {
1738                if (!pipe->readers) {
1739                        send_sig(SIGPIPE, current, 0);
1740                        ret = -EPIPE;
1741                        break;
1742                }
1743                if (flags & SPLICE_F_NONBLOCK) {
1744                        ret = -EAGAIN;
1745                        break;
1746                }
1747                if (signal_pending(current)) {
1748                        ret = -ERESTARTSYS;
1749                        break;
1750                }
1751                pipe->waiting_writers++;
1752                pipe_wait(pipe);
1753                pipe->waiting_writers--;
1754        }
1755
1756        pipe_unlock(pipe);
1757        return ret;
1758}
1759
1760/*
1761 * Splice contents of ipipe to opipe.
1762 */
1763static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1764                               struct pipe_inode_info *opipe,
1765                               size_t len, unsigned int flags)
1766{
1767        struct pipe_buffer *ibuf, *obuf;
1768        int ret = 0, nbuf;
1769        bool input_wakeup = false;
1770
1771
1772retry:
1773        ret = ipipe_prep(ipipe, flags);
1774        if (ret)
1775                return ret;
1776
1777        ret = opipe_prep(opipe, flags);
1778        if (ret)
1779                return ret;
1780
1781        /*
1782         * Potential ABBA deadlock, work around it by ordering lock
1783         * grabbing by pipe info address. Otherwise two different processes
1784         * could deadlock (one doing tee from A -> B, the other from B -> A).
1785         */
1786        pipe_double_lock(ipipe, opipe);
1787
1788        do {
1789                if (!opipe->readers) {
1790                        send_sig(SIGPIPE, current, 0);
1791                        if (!ret)
1792                                ret = -EPIPE;
1793                        break;
1794                }
1795
1796                if (!ipipe->nrbufs && !ipipe->writers)
1797                        break;
1798
1799                /*
1800                 * Cannot make any progress, because either the input
1801                 * pipe is empty or the output pipe is full.
1802                 */
1803                if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
1804                        /* Already processed some buffers, break */
1805                        if (ret)
1806                                break;
1807
1808                        if (flags & SPLICE_F_NONBLOCK) {
1809                                ret = -EAGAIN;
1810                                break;
1811                        }
1812
1813                        /*
1814                         * We raced with another reader/writer and haven't
1815                         * managed to process any buffers.  A zero return
1816                         * value means EOF, so retry instead.
1817                         */
1818                        pipe_unlock(ipipe);
1819                        pipe_unlock(opipe);
1820                        goto retry;
1821                }
1822
1823                ibuf = ipipe->bufs + ipipe->curbuf;
1824                nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
1825                obuf = opipe->bufs + nbuf;
1826
1827                if (len >= ibuf->len) {
1828                        /*
1829                         * Simply move the whole buffer from ipipe to opipe
1830                         */
1831                        *obuf = *ibuf;
1832                        ibuf->ops = NULL;
1833                        opipe->nrbufs++;
1834                        ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
1835                        ipipe->nrbufs--;
1836                        input_wakeup = true;
1837                } else {
1838                        /*
1839                         * Get a reference to this pipe buffer,
1840                         * so we can copy the contents over.
1841                         */
1842                        ibuf->ops->get(ipipe, ibuf);
1843                        *obuf = *ibuf;
1844
1845                        /*
1846                         * Don't inherit the gift flag, we need to
1847                         * prevent multiple steals of this page.
1848                         */
1849                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1850
1851                        obuf->len = len;
1852                        opipe->nrbufs++;
1853                        ibuf->offset += obuf->len;
1854                        ibuf->len -= obuf->len;
1855                }
1856                ret += obuf->len;
1857                len -= obuf->len;
1858        } while (len);
1859
1860        pipe_unlock(ipipe);
1861        pipe_unlock(opipe);
1862
1863        /*
1864         * If we put data in the output pipe, wakeup any potential readers.
1865         */
1866        if (ret > 0) {
1867                smp_mb();
1868                if (waitqueue_active(&opipe->wait))
1869                        wake_up_interruptible(&opipe->wait);
1870                kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1871        }
1872        if (input_wakeup)
1873                wakeup_pipe_writers(ipipe);
1874
1875        return ret;
1876}
1877
1878/*
1879 * Link contents of ipipe to opipe.
1880 */
1881static int link_pipe(struct pipe_inode_info *ipipe,
1882                     struct pipe_inode_info *opipe,
1883                     size_t len, unsigned int flags)
1884{
1885        struct pipe_buffer *ibuf, *obuf;
1886        int ret = 0, i = 0, nbuf;
1887
1888        /*
1889         * Potential ABBA deadlock, work around it by ordering lock
1890         * grabbing by pipe info address. Otherwise two different processes
1891         * could deadlock (one doing tee from A -> B, the other from B -> A).
1892         */
1893        pipe_double_lock(ipipe, opipe);
1894
1895        do {
1896                if (!opipe->readers) {
1897                        send_sig(SIGPIPE, current, 0);
1898                        if (!ret)
1899                                ret = -EPIPE;
1900                        break;
1901                }
1902
1903                /*
1904                 * If we have iterated all input buffers or ran out of
1905                 * output room, break.
1906                 */
1907                if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
1908                        break;
1909
1910                ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1911                nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1912
1913                /*
1914                 * Get a reference to this pipe buffer,
1915                 * so we can copy the contents over.
1916                 */
1917                ibuf->ops->get(ipipe, ibuf);
1918
1919                obuf = opipe->bufs + nbuf;
1920                *obuf = *ibuf;
1921
1922                /*
1923                 * Don't inherit the gift flag, we need to
1924                 * prevent multiple steals of this page.
1925                 */
1926                obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1927
1928                if (obuf->len > len)
1929                        obuf->len = len;
1930
1931                opipe->nrbufs++;
1932                ret += obuf->len;
1933                len -= obuf->len;
1934                i++;
1935        } while (len);
1936
1937        /*
1938         * return EAGAIN if we have the potential of some data in the
1939         * future, otherwise just return 0
1940         */
1941        if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1942                ret = -EAGAIN;
1943
1944        pipe_unlock(ipipe);
1945        pipe_unlock(opipe);
1946
1947        /*
1948         * If we put data in the output pipe, wakeup any potential readers.
1949         */
1950        if (ret > 0) {
1951                smp_mb();
1952                if (waitqueue_active(&opipe->wait))
1953                        wake_up_interruptible(&opipe->wait);
1954                kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1955        }
1956
1957        return ret;
1958}
1959
1960/*
1961 * This is a tee(1) implementation that works on pipes. It doesn't copy
1962 * any data, it simply references the 'in' pages on the 'out' pipe.
1963 * The 'flags' used are the SPLICE_F_* variants, currently the only
1964 * applicable one is SPLICE_F_NONBLOCK.
1965 */
1966static long do_tee(struct file *in, struct file *out, size_t len,
1967                   unsigned int flags)
1968{
1969        struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
1970        struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
1971        int ret = -EINVAL;
1972
1973        /*
1974         * Duplicate the contents of ipipe to opipe without actually
1975         * copying the data.
1976         */
1977        if (ipipe && opipe && ipipe != opipe) {
1978                /*
1979                 * Keep going, unless we encounter an error. The ipipe/opipe
1980                 * ordering doesn't really matter.
1981                 */
1982                ret = ipipe_prep(ipipe, flags);
1983                if (!ret) {
1984                        ret = opipe_prep(opipe, flags);
1985                        if (!ret)
1986                                ret = link_pipe(ipipe, opipe, len, flags);
1987                }
1988        }
1989
1990        return ret;
1991}
1992
1993SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1994{
1995        struct file *in;
1996        int error, fput_in;
1997
1998        if (unlikely(!len))
1999                return 0;
2000
2001        error = -EBADF;
2002        in = fget_light(fdin, &fput_in);
2003        if (in) {
2004                if (in->f_mode & FMODE_READ) {
2005                        int fput_out;
2006                        struct file *out = fget_light(fdout, &fput_out);
2007
2008                        if (out) {
2009                                if (out->f_mode & FMODE_WRITE)
2010                                        error = do_tee(in, out, len, flags);
2011                                fput_light(out, fput_out);
2012                        }
2013                }
2014                fput_light(in, fput_in);
2015        }
2016
2017        return error;
2018}
2019
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.