linux/fs/splice.c
<<
>>
Prefs
   1/*
   2 * "splice": joining two ropes together by interweaving their strands.
   3 *
   4 * This is the "extended pipe" functionality, where a pipe is used as
   5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
   6 * buffer that you can use to transfer data from one end to the other.
   7 *
   8 * The traditional unix read/write is extended with a "splice()" operation
   9 * that transfers data buffers to or from a pipe buffer.
  10 *
  11 * Named by Larry McVoy, original implementation from Linus, extended by
  12 * Jens to support splicing to files, network, direct splicing, etc and
  13 * fixing lots of bugs.
  14 *
  15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
  16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
  17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
  18 *
  19 */
  20#include <linux/fs.h>
  21#include <linux/file.h>
  22#include <linux/pagemap.h>
  23#include <linux/splice.h>
  24#include <linux/mm_inline.h>
  25#include <linux/swap.h>
  26#include <linux/writeback.h>
  27#include <linux/buffer_head.h>
  28#include <linux/module.h>
  29#include <linux/syscalls.h>
  30#include <linux/uio.h>
  31#include <linux/security.h>
  32
  33/*
  34 * Attempt to steal a page from a pipe buffer. This should perhaps go into
  35 * a vm helper function, it's already simplified quite a bit by the
  36 * addition of remove_mapping(). If success is returned, the caller may
  37 * attempt to reuse this page for another destination.
  38 */
  39static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
  40                                     struct pipe_buffer *buf)
  41{
  42        struct page *page = buf->page;
  43        struct address_space *mapping;
  44
  45        lock_page(page);
  46
  47        mapping = page_mapping(page);
  48        if (mapping) {
  49                WARN_ON(!PageUptodate(page));
  50
  51                /*
  52                 * At least for ext2 with nobh option, we need to wait on
  53                 * writeback completing on this page, since we'll remove it
  54                 * from the pagecache.  Otherwise truncate wont wait on the
  55                 * page, allowing the disk blocks to be reused by someone else
  56                 * before we actually wrote our data to them. fs corruption
  57                 * ensues.
  58                 */
  59                wait_on_page_writeback(page);
  60
  61                if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
  62                        goto out_unlock;
  63
  64                /*
  65                 * If we succeeded in removing the mapping, set LRU flag
  66                 * and return good.
  67                 */
  68                if (remove_mapping(mapping, page)) {
  69                        buf->flags |= PIPE_BUF_FLAG_LRU;
  70                        return 0;
  71                }
  72        }
  73
  74        /*
  75         * Raced with truncate or failed to remove page from current
  76         * address space, unlock and return failure.
  77         */
  78out_unlock:
  79        unlock_page(page);
  80        return 1;
  81}
  82
  83static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
  84                                        struct pipe_buffer *buf)
  85{
  86        page_cache_release(buf->page);
  87        buf->flags &= ~PIPE_BUF_FLAG_LRU;
  88}
  89
  90/*
  91 * Check whether the contents of buf is OK to access. Since the content
  92 * is a page cache page, IO may be in flight.
  93 */
  94static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
  95                                       struct pipe_buffer *buf)
  96{
  97        struct page *page = buf->page;
  98        int err;
  99
 100        if (!PageUptodate(page)) {
 101                lock_page(page);
 102
 103                /*
 104                 * Page got truncated/unhashed. This will cause a 0-byte
 105                 * splice, if this is the first page.
 106                 */
 107                if (!page->mapping) {
 108                        err = -ENODATA;
 109                        goto error;
 110                }
 111
 112                /*
 113                 * Uh oh, read-error from disk.
 114                 */
 115                if (!PageUptodate(page)) {
 116                        err = -EIO;
 117                        goto error;
 118                }
 119
 120                /*
 121                 * Page is ok afterall, we are done.
 122                 */
 123                unlock_page(page);
 124        }
 125
 126        return 0;
 127error:
 128        unlock_page(page);
 129        return err;
 130}
 131
 132static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
 133        .can_merge = 0,
 134        .map = generic_pipe_buf_map,
 135        .unmap = generic_pipe_buf_unmap,
 136        .confirm = page_cache_pipe_buf_confirm,
 137        .release = page_cache_pipe_buf_release,
 138        .steal = page_cache_pipe_buf_steal,
 139        .get = generic_pipe_buf_get,
 140};
 141
 142static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 143                                    struct pipe_buffer *buf)
 144{
 145        if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
 146                return 1;
 147
 148        buf->flags |= PIPE_BUF_FLAG_LRU;
 149        return generic_pipe_buf_steal(pipe, buf);
 150}
 151
 152static const struct pipe_buf_operations user_page_pipe_buf_ops = {
 153        .can_merge = 0,
 154        .map = generic_pipe_buf_map,
 155        .unmap = generic_pipe_buf_unmap,
 156        .confirm = generic_pipe_buf_confirm,
 157        .release = page_cache_pipe_buf_release,
 158        .steal = user_page_pipe_buf_steal,
 159        .get = generic_pipe_buf_get,
 160};
 161
 162/**
 163 * splice_to_pipe - fill passed data into a pipe
 164 * @pipe:       pipe to fill
 165 * @spd:        data to fill
 166 *
 167 * Description:
 168 *    @spd contains a map of pages and len/offset tuples, along with
 169 *    the struct pipe_buf_operations associated with these pages. This
 170 *    function will link that data to the pipe.
 171 *
 172 */
 173ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 174                       struct splice_pipe_desc *spd)
 175{
 176        unsigned int spd_pages = spd->nr_pages;
 177        int ret, do_wakeup, page_nr;
 178
 179        ret = 0;
 180        do_wakeup = 0;
 181        page_nr = 0;
 182
 183        if (pipe->inode)
 184                mutex_lock(&pipe->inode->i_mutex);
 185
 186        for (;;) {
 187                if (!pipe->readers) {
 188                        send_sig(SIGPIPE, current, 0);
 189                        if (!ret)
 190                                ret = -EPIPE;
 191                        break;
 192                }
 193
 194                if (pipe->nrbufs < PIPE_BUFFERS) {
 195                        int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
 196                        struct pipe_buffer *buf = pipe->bufs + newbuf;
 197
 198                        buf->page = spd->pages[page_nr];
 199                        buf->offset = spd->partial[page_nr].offset;
 200                        buf->len = spd->partial[page_nr].len;
 201                        buf->private = spd->partial[page_nr].private;
 202                        buf->ops = spd->ops;
 203                        if (spd->flags & SPLICE_F_GIFT)
 204                                buf->flags |= PIPE_BUF_FLAG_GIFT;
 205
 206                        pipe->nrbufs++;
 207                        page_nr++;
 208                        ret += buf->len;
 209
 210                        if (pipe->inode)
 211                                do_wakeup = 1;
 212
 213                        if (!--spd->nr_pages)
 214                                break;
 215                        if (pipe->nrbufs < PIPE_BUFFERS)
 216                                continue;
 217
 218                        break;
 219                }
 220
 221                if (spd->flags & SPLICE_F_NONBLOCK) {
 222                        if (!ret)
 223                                ret = -EAGAIN;
 224                        break;
 225                }
 226
 227                if (signal_pending(current)) {
 228                        if (!ret)
 229                                ret = -ERESTARTSYS;
 230                        break;
 231                }
 232
 233                if (do_wakeup) {
 234                        smp_mb();
 235                        if (waitqueue_active(&pipe->wait))
 236                                wake_up_interruptible_sync(&pipe->wait);
 237                        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 238                        do_wakeup = 0;
 239                }
 240
 241                pipe->waiting_writers++;
 242                pipe_wait(pipe);
 243                pipe->waiting_writers--;
 244        }
 245
 246        if (pipe->inode) {
 247                mutex_unlock(&pipe->inode->i_mutex);
 248
 249                if (do_wakeup) {
 250                        smp_mb();
 251                        if (waitqueue_active(&pipe->wait))
 252                                wake_up_interruptible(&pipe->wait);
 253                        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 254                }
 255        }
 256
 257        while (page_nr < spd_pages)
 258                spd->spd_release(spd, page_nr++);
 259
 260        return ret;
 261}
 262
 263static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
 264{
 265        page_cache_release(spd->pages[i]);
 266}
 267
 268static int
 269__generic_file_splice_read(struct file *in, loff_t *ppos,
 270                           struct pipe_inode_info *pipe, size_t len,
 271                           unsigned int flags)
 272{
 273        struct address_space *mapping = in->f_mapping;
 274        unsigned int loff, nr_pages, req_pages;
 275        struct page *pages[PIPE_BUFFERS];
 276        struct partial_page partial[PIPE_BUFFERS];
 277        struct page *page;
 278        pgoff_t index, end_index;
 279        loff_t isize;
 280        int error, page_nr;
 281        struct splice_pipe_desc spd = {
 282                .pages = pages,
 283                .partial = partial,
 284                .flags = flags,
 285                .ops = &page_cache_pipe_buf_ops,
 286                .spd_release = spd_release_page,
 287        };
 288
 289        index = *ppos >> PAGE_CACHE_SHIFT;
 290        loff = *ppos & ~PAGE_CACHE_MASK;
 291        req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 292        nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
 293
 294        /*
 295         * Lookup the (hopefully) full range of pages we need.
 296         */
 297        spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
 298        index += spd.nr_pages;
 299
 300        /*
 301         * If find_get_pages_contig() returned fewer pages than we needed,
 302         * readahead/allocate the rest and fill in the holes.
 303         */
 304        if (spd.nr_pages < nr_pages)
 305                page_cache_sync_readahead(mapping, &in->f_ra, in,
 306                                index, req_pages - spd.nr_pages);
 307
 308        error = 0;
 309        while (spd.nr_pages < nr_pages) {
 310                /*
 311                 * Page could be there, find_get_pages_contig() breaks on
 312                 * the first hole.
 313                 */
 314                page = find_get_page(mapping, index);
 315                if (!page) {
 316                        /*
 317                         * page didn't exist, allocate one.
 318                         */
 319                        page = page_cache_alloc_cold(mapping);
 320                        if (!page)
 321                                break;
 322
 323                        error = add_to_page_cache_lru(page, mapping, index,
 324                                                mapping_gfp_mask(mapping));
 325                        if (unlikely(error)) {
 326                                page_cache_release(page);
 327                                if (error == -EEXIST)
 328                                        continue;
 329                                break;
 330                        }
 331                        /*
 332                         * add_to_page_cache() locks the page, unlock it
 333                         * to avoid convoluting the logic below even more.
 334                         */
 335                        unlock_page(page);
 336                }
 337
 338                pages[spd.nr_pages++] = page;
 339                index++;
 340        }
 341
 342        /*
 343         * Now loop over the map and see if we need to start IO on any
 344         * pages, fill in the partial map, etc.
 345         */
 346        index = *ppos >> PAGE_CACHE_SHIFT;
 347        nr_pages = spd.nr_pages;
 348        spd.nr_pages = 0;
 349        for (page_nr = 0; page_nr < nr_pages; page_nr++) {
 350                unsigned int this_len;
 351
 352                if (!len)
 353                        break;
 354
 355                /*
 356                 * this_len is the max we'll use from this page
 357                 */
 358                this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
 359                page = pages[page_nr];
 360
 361                if (PageReadahead(page))
 362                        page_cache_async_readahead(mapping, &in->f_ra, in,
 363                                        page, index, req_pages - page_nr);
 364
 365                /*
 366                 * If the page isn't uptodate, we may need to start io on it
 367                 */
 368                if (!PageUptodate(page)) {
 369                        /*
 370                         * If in nonblock mode then dont block on waiting
 371                         * for an in-flight io page
 372                         */
 373                        if (flags & SPLICE_F_NONBLOCK) {
 374                                if (!trylock_page(page)) {
 375                                        error = -EAGAIN;
 376                                        break;
 377                                }
 378                        } else
 379                                lock_page(page);
 380
 381                        /*
 382                         * Page was truncated, or invalidated by the
 383                         * filesystem.  Redo the find/create, but this time the
 384                         * page is kept locked, so there's no chance of another
 385                         * race with truncate/invalidate.
 386                         */
 387                        if (!page->mapping) {
 388                                unlock_page(page);
 389                                page = find_or_create_page(mapping, index,
 390                                                mapping_gfp_mask(mapping));
 391
 392                                if (!page) {
 393                                        error = -ENOMEM;
 394                                        break;
 395                                }
 396                                page_cache_release(pages[page_nr]);
 397                                pages[page_nr] = page;
 398                        }
 399                        /*
 400                         * page was already under io and is now done, great
 401                         */
 402                        if (PageUptodate(page)) {
 403                                unlock_page(page);
 404                                goto fill_it;
 405                        }
 406
 407                        /*
 408                         * need to read in the page
 409                         */
 410                        error = mapping->a_ops->readpage(in, page);
 411                        if (unlikely(error)) {
 412                                /*
 413                                 * We really should re-lookup the page here,
 414                                 * but it complicates things a lot. Instead
 415                                 * lets just do what we already stored, and
 416                                 * we'll get it the next time we are called.
 417                                 */
 418                                if (error == AOP_TRUNCATED_PAGE)
 419                                        error = 0;
 420
 421                                break;
 422                        }
 423                }
 424fill_it:
 425                /*
 426                 * i_size must be checked after PageUptodate.
 427                 */
 428                isize = i_size_read(mapping->host);
 429                end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 430                if (unlikely(!isize || index > end_index))
 431                        break;
 432
 433                /*
 434                 * if this is the last page, see if we need to shrink
 435                 * the length and stop
 436                 */
 437                if (end_index == index) {
 438                        unsigned int plen;
 439
 440                        /*
 441                         * max good bytes in this page
 442                         */
 443                        plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
 444                        if (plen <= loff)
 445                                break;
 446
 447                        /*
 448                         * force quit after adding this page
 449                         */
 450                        this_len = min(this_len, plen - loff);
 451                        len = this_len;
 452                }
 453
 454                partial[page_nr].offset = loff;
 455                partial[page_nr].len = this_len;
 456                len -= this_len;
 457                loff = 0;
 458                spd.nr_pages++;
 459                index++;
 460        }
 461
 462        /*
 463         * Release any pages at the end, if we quit early. 'page_nr' is how far
 464         * we got, 'nr_pages' is how many pages are in the map.
 465         */
 466        while (page_nr < nr_pages)
 467                page_cache_release(pages[page_nr++]);
 468        in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 469
 470        if (spd.nr_pages)
 471                return splice_to_pipe(pipe, &spd);
 472
 473        return error;
 474}
 475
 476/**
 477 * generic_file_splice_read - splice data from file to a pipe
 478 * @in:         file to splice from
 479 * @ppos:       position in @in
 480 * @pipe:       pipe to splice to
 481 * @len:        number of bytes to splice
 482 * @flags:      splice modifier flags
 483 *
 484 * Description:
 485 *    Will read pages from given file and fill them into a pipe. Can be
 486 *    used as long as the address_space operations for the source implements
 487 *    a readpage() hook.
 488 *
 489 */
 490ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 491                                 struct pipe_inode_info *pipe, size_t len,
 492                                 unsigned int flags)
 493{
 494        loff_t isize, left;
 495        int ret;
 496
 497        isize = i_size_read(in->f_mapping->host);
 498        if (unlikely(*ppos >= isize))
 499                return 0;
 500
 501        left = isize - *ppos;
 502        if (unlikely(left < len))
 503                len = left;
 504
 505        ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
 506        if (ret > 0)
 507                *ppos += ret;
 508
 509        return ret;
 510}
 511
 512EXPORT_SYMBOL(generic_file_splice_read);
 513
 514/*
 515 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 516 * using sendpage(). Return the number of bytes sent.
 517 */
 518static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 519                            struct pipe_buffer *buf, struct splice_desc *sd)
 520{
 521        struct file *file = sd->u.file;
 522        loff_t pos = sd->pos;
 523        int ret, more;
 524
 525        ret = buf->ops->confirm(pipe, buf);
 526        if (!ret) {
 527                more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
 528
 529                ret = file->f_op->sendpage(file, buf->page, buf->offset,
 530                                           sd->len, &pos, more);
 531        }
 532
 533        return ret;
 534}
 535
 536/*
 537 * This is a little more tricky than the file -> pipe splicing. There are
 538 * basically three cases:
 539 *
 540 *      - Destination page already exists in the address space and there
 541 *        are users of it. For that case we have no other option that
 542 *        copying the data. Tough luck.
 543 *      - Destination page already exists in the address space, but there
 544 *        are no users of it. Make sure it's uptodate, then drop it. Fall
 545 *        through to last case.
 546 *      - Destination page does not exist, we can add the pipe page to
 547 *        the page cache and avoid the copy.
 548 *
 549 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 550 * sd->flags), we attempt to migrate pages from the pipe to the output
 551 * file address space page cache. This is possible if no one else has
 552 * the pipe page referenced outside of the pipe and page cache. If
 553 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 554 * a new page in the output file page cache and fill/dirty that.
 555 */
 556static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 557                        struct splice_desc *sd)
 558{
 559        struct file *file = sd->u.file;
 560        struct address_space *mapping = file->f_mapping;
 561        unsigned int offset, this_len;
 562        struct page *page;
 563        void *fsdata;
 564        int ret;
 565
 566        /*
 567         * make sure the data in this buffer is uptodate
 568         */
 569        ret = buf->ops->confirm(pipe, buf);
 570        if (unlikely(ret))
 571                return ret;
 572
 573        offset = sd->pos & ~PAGE_CACHE_MASK;
 574
 575        this_len = sd->len;
 576        if (this_len + offset > PAGE_CACHE_SIZE)
 577                this_len = PAGE_CACHE_SIZE - offset;
 578
 579        ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
 580                                AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 581        if (unlikely(ret))
 582                goto out;
 583
 584        if (buf->page != page) {
 585                /*
 586                 * Careful, ->map() uses KM_USER0!
 587                 */
 588                char *src = buf->ops->map(pipe, buf, 1);
 589                char *dst = kmap_atomic(page, KM_USER1);
 590
 591                memcpy(dst + offset, src + buf->offset, this_len);
 592                flush_dcache_page(page);
 593                kunmap_atomic(dst, KM_USER1);
 594                buf->ops->unmap(pipe, buf, src);
 595        }
 596        ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
 597                                page, fsdata);
 598out:
 599        return ret;
 600}
 601
 602/**
 603 * __splice_from_pipe - splice data from a pipe to given actor
 604 * @pipe:       pipe to splice from
 605 * @sd:         information to @actor
 606 * @actor:      handler that splices the data
 607 *
 608 * Description:
 609 *    This function does little more than loop over the pipe and call
 610 *    @actor to do the actual moving of a single struct pipe_buffer to
 611 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 612 *    pipe_to_user.
 613 *
 614 */
 615ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
 616                           splice_actor *actor)
 617{
 618        int ret, do_wakeup, err;
 619
 620        ret = 0;
 621        do_wakeup = 0;
 622
 623        for (;;) {
 624                if (pipe->nrbufs) {
 625                        struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 626                        const struct pipe_buf_operations *ops = buf->ops;
 627
 628                        sd->len = buf->len;
 629                        if (sd->len > sd->total_len)
 630                                sd->len = sd->total_len;
 631
 632                        err = actor(pipe, buf, sd);
 633                        if (err <= 0) {
 634                                if (!ret && err != -ENODATA)
 635                                        ret = err;
 636
 637                                break;
 638                        }
 639
 640                        ret += err;
 641                        buf->offset += err;
 642                        buf->len -= err;
 643
 644                        sd->len -= err;
 645                        sd->pos += err;
 646                        sd->total_len -= err;
 647                        if (sd->len)
 648                                continue;
 649
 650                        if (!buf->len) {
 651                                buf->ops = NULL;
 652                                ops->release(pipe, buf);
 653                                pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
 654                                pipe->nrbufs--;
 655                                if (pipe->inode)
 656                                        do_wakeup = 1;
 657                        }
 658
 659                        if (!sd->total_len)
 660                                break;
 661                }
 662
 663                if (pipe->nrbufs)
 664                        continue;
 665                if (!pipe->writers)
 666                        break;
 667                if (!pipe->waiting_writers) {
 668                        if (ret)
 669                                break;
 670                }
 671
 672                if (sd->flags & SPLICE_F_NONBLOCK) {
 673                        if (!ret)
 674                                ret = -EAGAIN;
 675                        break;
 676                }
 677
 678                if (signal_pending(current)) {
 679                        if (!ret)
 680                                ret = -ERESTARTSYS;
 681                        break;
 682                }
 683
 684                if (do_wakeup) {
 685                        smp_mb();
 686                        if (waitqueue_active(&pipe->wait))
 687                                wake_up_interruptible_sync(&pipe->wait);
 688                        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 689                        do_wakeup = 0;
 690                }
 691
 692                pipe_wait(pipe);
 693        }
 694
 695        if (do_wakeup) {
 696                smp_mb();
 697                if (waitqueue_active(&pipe->wait))
 698                        wake_up_interruptible(&pipe->wait);
 699                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 700        }
 701
 702        return ret;
 703}
 704EXPORT_SYMBOL(__splice_from_pipe);
 705
 706/**
 707 * splice_from_pipe - splice data from a pipe to a file
 708 * @pipe:       pipe to splice from
 709 * @out:        file to splice to
 710 * @ppos:       position in @out
 711 * @len:        how many bytes to splice
 712 * @flags:      splice modifier flags
 713 * @actor:      handler that splices the data
 714 *
 715 * Description:
 716 *    See __splice_from_pipe. This function locks the input and output inodes,
 717 *    otherwise it's identical to __splice_from_pipe().
 718 *
 719 */
 720ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 721                         loff_t *ppos, size_t len, unsigned int flags,
 722                         splice_actor *actor)
 723{
 724        ssize_t ret;
 725        struct inode *inode = out->f_mapping->host;
 726        struct splice_desc sd = {
 727                .total_len = len,
 728                .flags = flags,
 729                .pos = *ppos,
 730                .u.file = out,
 731        };
 732
 733        /*
 734         * The actor worker might be calling ->write_begin and
 735         * ->write_end. Most of the time, these expect i_mutex to
 736         * be held. Since this may result in an ABBA deadlock with
 737         * pipe->inode, we have to order lock acquiry here.
 738         */
 739        inode_double_lock(inode, pipe->inode);
 740        ret = __splice_from_pipe(pipe, &sd, actor);
 741        inode_double_unlock(inode, pipe->inode);
 742
 743        return ret;
 744}
 745
 746/**
 747 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 748 * @pipe:       pipe info
 749 * @out:        file to write to
 750 * @ppos:       position in @out
 751 * @len:        number of bytes to splice
 752 * @flags:      splice modifier flags
 753 *
 754 * Description:
 755 *    Will either move or copy pages (determined by @flags options) from
 756 *    the given pipe inode to the given file. The caller is responsible
 757 *    for acquiring i_mutex on both inodes.
 758 *
 759 */
 760ssize_t
 761generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
 762                                 loff_t *ppos, size_t len, unsigned int flags)
 763{
 764        struct address_space *mapping = out->f_mapping;
 765        struct inode *inode = mapping->host;
 766        struct splice_desc sd = {
 767                .total_len = len,
 768                .flags = flags,
 769                .pos = *ppos,
 770                .u.file = out,
 771        };
 772        ssize_t ret;
 773        int err;
 774
 775        err = file_remove_suid(out);
 776        if (unlikely(err))
 777                return err;
 778
 779        ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
 780        if (ret > 0) {
 781                unsigned long nr_pages;
 782
 783                *ppos += ret;
 784                nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 785
 786                /*
 787                 * If file or inode is SYNC and we actually wrote some data,
 788                 * sync it.
 789                 */
 790                if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
 791                        err = generic_osync_inode(inode, mapping,
 792                                                  OSYNC_METADATA|OSYNC_DATA);
 793
 794                        if (err)
 795                                ret = err;
 796                }
 797                balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 798        }
 799
 800        return ret;
 801}
 802
 803EXPORT_SYMBOL(generic_file_splice_write_nolock);
 804
 805/**
 806 * generic_file_splice_write - splice data from a pipe to a file
 807 * @pipe:       pipe info
 808 * @out:        file to write to
 809 * @ppos:       position in @out
 810 * @len:        number of bytes to splice
 811 * @flags:      splice modifier flags
 812 *
 813 * Description:
 814 *    Will either move or copy pages (determined by @flags options) from
 815 *    the given pipe inode to the given file.
 816 *
 817 */
 818ssize_t
 819generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 820                          loff_t *ppos, size_t len, unsigned int flags)
 821{
 822        struct address_space *mapping = out->f_mapping;
 823        struct inode *inode = mapping->host;
 824        struct splice_desc sd = {
 825                .total_len = len,
 826                .flags = flags,
 827                .pos = *ppos,
 828                .u.file = out,
 829        };
 830        ssize_t ret;
 831
 832        inode_double_lock(inode, pipe->inode);
 833        ret = file_remove_suid(out);
 834        if (likely(!ret))
 835                ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
 836        inode_double_unlock(inode, pipe->inode);
 837        if (ret > 0) {
 838                unsigned long nr_pages;
 839
 840                *ppos += ret;
 841                nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 842
 843                /*
 844                 * If file or inode is SYNC and we actually wrote some data,
 845                 * sync it.
 846                 */
 847                if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
 848                        int err;
 849
 850                        mutex_lock(&inode->i_mutex);
 851                        err = generic_osync_inode(inode, mapping,
 852                                                  OSYNC_METADATA|OSYNC_DATA);
 853                        mutex_unlock(&inode->i_mutex);
 854
 855                        if (err)
 856                                ret = err;
 857                }
 858                balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 859        }
 860
 861        return ret;
 862}
 863
 864EXPORT_SYMBOL(generic_file_splice_write);
 865
 866/**
 867 * generic_splice_sendpage - splice data from a pipe to a socket
 868 * @pipe:       pipe to splice from
 869 * @out:        socket to write to
 870 * @ppos:       position in @out
 871 * @len:        number of bytes to splice
 872 * @flags:      splice modifier flags
 873 *
 874 * Description:
 875 *    Will send @len bytes from the pipe to a network socket. No data copying
 876 *    is involved.
 877 *
 878 */
 879ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 880                                loff_t *ppos, size_t len, unsigned int flags)
 881{
 882        return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 883}
 884
 885EXPORT_SYMBOL(generic_splice_sendpage);
 886
 887/*
 888 * Attempt to initiate a splice from pipe to file.
 889 */
 890static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 891                           loff_t *ppos, size_t len, unsigned int flags)
 892{
 893        int ret;
 894
 895        if (unlikely(!out->f_op || !out->f_op->splice_write))
 896                return -EINVAL;
 897
 898        if (unlikely(!(out->f_mode & FMODE_WRITE)))
 899                return -EBADF;
 900
 901        if (unlikely(out->f_flags & O_APPEND))
 902                return -EINVAL;
 903
 904        ret = rw_verify_area(WRITE, out, ppos, len);
 905        if (unlikely(ret < 0))
 906                return ret;
 907
 908        return out->f_op->splice_write(pipe, out, ppos, len, flags);
 909}
 910
 911/*
 912 * Attempt to initiate a splice from a file to a pipe.
 913 */
 914static long do_splice_to(struct file *in, loff_t *ppos,
 915                         struct pipe_inode_info *pipe, size_t len,
 916                         unsigned int flags)
 917{
 918        int ret;
 919
 920        if (unlikely(!in->f_op || !in->f_op->splice_read))
 921                return -EINVAL;
 922
 923        if (unlikely(!(in->f_mode & FMODE_READ)))
 924                return -EBADF;
 925
 926        ret = rw_verify_area(READ, in, ppos, len);
 927        if (unlikely(ret < 0))
 928                return ret;
 929
 930        return in->f_op->splice_read(in, ppos, pipe, len, flags);
 931}
 932
 933/**
 934 * splice_direct_to_actor - splices data directly between two non-pipes
 935 * @in:         file to splice from
 936 * @sd:         actor information on where to splice to
 937 * @actor:      handles the data splicing
 938 *
 939 * Description:
 940 *    This is a special case helper to splice directly between two
 941 *    points, without requiring an explicit pipe. Internally an allocated
 942 *    pipe is cached in the process, and reused during the lifetime of
 943 *    that process.
 944 *
 945 */
 946ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 947                               splice_direct_actor *actor)
 948{
 949        struct pipe_inode_info *pipe;
 950        long ret, bytes;
 951        umode_t i_mode;
 952        size_t len;
 953        int i, flags;
 954
 955        /*
 956         * We require the input being a regular file, as we don't want to
 957         * randomly drop data for eg socket -> socket splicing. Use the
 958         * piped splicing for that!
 959         */
 960        i_mode = in->f_path.dentry->d_inode->i_mode;
 961        if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
 962                return -EINVAL;
 963
 964        /*
 965         * neither in nor out is a pipe, setup an internal pipe attached to
 966         * 'out' and transfer the wanted data from 'in' to 'out' through that
 967         */
 968        pipe = current->splice_pipe;
 969        if (unlikely(!pipe)) {
 970                pipe = alloc_pipe_info(NULL);
 971                if (!pipe)
 972                        return -ENOMEM;
 973
 974                /*
 975                 * We don't have an immediate reader, but we'll read the stuff
 976                 * out of the pipe right after the splice_to_pipe(). So set
 977                 * PIPE_READERS appropriately.
 978                 */
 979                pipe->readers = 1;
 980
 981                current->splice_pipe = pipe;
 982        }
 983
 984        /*
 985         * Do the splice.
 986         */
 987        ret = 0;
 988        bytes = 0;
 989        len = sd->total_len;
 990        flags = sd->flags;
 991
 992        /*
 993         * Don't block on output, we have to drain the direct pipe.
 994         */
 995        sd->flags &= ~SPLICE_F_NONBLOCK;
 996
 997        while (len) {
 998                size_t read_len;
 999                loff_t pos = sd->pos, prev_pos = pos;
1000
1001                ret = do_splice_to(in, &pos, pipe, len, flags);
1002                if (unlikely(ret <= 0))
1003                        goto out_release;
1004
1005                read_len = ret;
1006                sd->total_len = read_len;
1007
1008                /*
1009                 * NOTE: nonblocking mode only applies to the input. We
1010                 * must not do the output in nonblocking mode as then we
1011                 * could get stuck data in the internal pipe:
1012                 */
1013                ret = actor(pipe, sd);
1014                if (unlikely(ret <= 0)) {
1015                        sd->pos = prev_pos;
1016                        goto out_release;
1017                }
1018
1019                bytes += ret;
1020                len -= ret;
1021                sd->pos = pos;
1022
1023                if (ret < read_len) {
1024                        sd->pos = prev_pos + ret;
1025                        goto out_release;
1026                }
1027        }
1028
1029done:
1030        pipe->nrbufs = pipe->curbuf = 0;
1031        file_accessed(in);
1032        return bytes;
1033
1034out_release:
1035        /*
1036         * If we did an incomplete transfer we must release
1037         * the pipe buffers in question:
1038         */
1039        for (i = 0; i < PIPE_BUFFERS; i++) {
1040                struct pipe_buffer *buf = pipe->bufs + i;
1041
1042                if (buf->ops) {
1043                        buf->ops->release(pipe, buf);
1044                        buf->ops = NULL;
1045                }
1046        }
1047
1048        if (!bytes)
1049                bytes = ret;
1050
1051        goto done;
1052}
1053EXPORT_SYMBOL(splice_direct_to_actor);
1054
1055static int direct_splice_actor(struct pipe_inode_info *pipe,
1056                               struct splice_desc *sd)
1057{
1058        struct file *file = sd->u.file;
1059
1060        return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
1061}
1062
1063/**
1064 * do_splice_direct - splices data directly between two files
1065 * @in:         file to splice from
1066 * @ppos:       input file offset
1067 * @out:        file to splice to
1068 * @len:        number of bytes to splice
1069 * @flags:      splice modifier flags
1070 *
1071 * Description:
1072 *    For use by do_sendfile(). splice can easily emulate sendfile, but
1073 *    doing it in the application would incur an extra system call
1074 *    (splice in + splice out, as compared to just sendfile()). So this helper
1075 *    can splice directly through a process-private pipe.
1076 *
1077 */
1078long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1079                      size_t len, unsigned int flags)
1080{
1081        struct splice_desc sd = {
1082                .len            = len,
1083                .total_len      = len,
1084                .flags          = flags,
1085                .pos            = *ppos,
1086                .u.file         = out,
1087        };
1088        long ret;
1089
1090        ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1091        if (ret > 0)
1092                *ppos = sd.pos;
1093
1094        return ret;
1095}
1096
1097/*
1098 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1099 * location, so checking ->i_pipe is not enough to verify that this is a
1100 * pipe.
1101 */
1102static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1103{
1104        if (S_ISFIFO(inode->i_mode))
1105                return inode->i_pipe;
1106
1107        return NULL;
1108}
1109
1110/*
1111 * Determine where to splice to/from.
1112 */
1113static long do_splice(struct file *in, loff_t __user *off_in,
1114                      struct file *out, loff_t __user *off_out,
1115                      size_t len, unsigned int flags)
1116{
1117        struct pipe_inode_info *pipe;
1118        loff_t offset, *off;
1119        long ret;
1120
1121        pipe = pipe_info(in->f_path.dentry->d_inode);
1122        if (pipe) {
1123                if (off_in)
1124                        return -ESPIPE;
1125                if (off_out) {
1126                        if (out->f_op->llseek == no_llseek)
1127                                return -EINVAL;
1128                        if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1129                                return -EFAULT;
1130                        off = &offset;
1131                } else
1132                        off = &out->f_pos;
1133
1134                ret = do_splice_from(pipe, out, off, len, flags);
1135
1136                if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1137                        ret = -EFAULT;
1138
1139                return ret;
1140        }
1141
1142        pipe = pipe_info(out->f_path.dentry->d_inode);
1143        if (pipe) {
1144                if (off_out)
1145                        return -ESPIPE;
1146                if (off_in) {
1147                        if (in->f_op->llseek == no_llseek)
1148                                return -EINVAL;
1149                        if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1150                                return -EFAULT;
1151                        off = &offset;
1152                } else
1153                        off = &in->f_pos;
1154
1155                ret = do_splice_to(in, off, pipe, len, flags);
1156
1157                if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1158                        ret = -EFAULT;
1159
1160                return ret;
1161        }
1162
1163        return -EINVAL;
1164}
1165
1166/*
1167 * Map an iov into an array of pages and offset/length tupples. With the
1168 * partial_page structure, we can map several non-contiguous ranges into
1169 * our ones pages[] map instead of splitting that operation into pieces.
1170 * Could easily be exported as a generic helper for other users, in which
1171 * case one would probably want to add a 'max_nr_pages' parameter as well.
1172 */
1173static int get_iovec_page_array(const struct iovec __user *iov,
1174                                unsigned int nr_vecs, struct page **pages,
1175                                struct partial_page *partial, int aligned)
1176{
1177        int buffers = 0, error = 0;
1178
1179        while (nr_vecs) {
1180                unsigned long off, npages;
1181                struct iovec entry;
1182                void __user *base;
1183                size_t len;
1184                int i;
1185
1186                error = -EFAULT;
1187                if (copy_from_user(&entry, iov, sizeof(entry)))
1188                        break;
1189
1190                base = entry.iov_base;
1191                len = entry.iov_len;
1192
1193                /*
1194                 * Sanity check this iovec. 0 read succeeds.
1195                 */
1196                error = 0;
1197                if (unlikely(!len))
1198                        break;
1199                error = -EFAULT;
1200                if (!access_ok(VERIFY_READ, base, len))
1201                        break;
1202
1203                /*
1204                 * Get this base offset and number of pages, then map
1205                 * in the user pages.
1206                 */
1207                off = (unsigned long) base & ~PAGE_MASK;
1208
1209                /*
1210                 * If asked for alignment, the offset must be zero and the
1211                 * length a multiple of the PAGE_SIZE.
1212                 */
1213                error = -EINVAL;
1214                if (aligned && (off || len & ~PAGE_MASK))
1215                        break;
1216
1217                npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1218                if (npages > PIPE_BUFFERS - buffers)
1219                        npages = PIPE_BUFFERS - buffers;
1220
1221                error = get_user_pages_fast((unsigned long)base, npages,
1222                                        0, &pages[buffers]);
1223
1224                if (unlikely(error <= 0))
1225                        break;
1226
1227                /*
1228                 * Fill this contiguous range into the partial page map.
1229                 */
1230                for (i = 0; i < error; i++) {
1231                        const int plen = min_t(size_t, len, PAGE_SIZE - off);
1232
1233                        partial[buffers].offset = off;
1234                        partial[buffers].len = plen;
1235
1236                        off = 0;
1237                        len -= plen;
1238                        buffers++;
1239                }
1240
1241                /*
1242                 * We didn't complete this iov, stop here since it probably
1243                 * means we have to move some of this into a pipe to
1244                 * be able to continue.
1245                 */
1246                if (len)
1247                        break;
1248
1249                /*
1250                 * Don't continue if we mapped fewer pages than we asked for,
1251                 * or if we mapped the max number of pages that we have
1252                 * room for.
1253                 */
1254                if (error < npages || buffers == PIPE_BUFFERS)
1255                        break;
1256
1257                nr_vecs--;
1258                iov++;
1259        }
1260
1261        if (buffers)
1262                return buffers;
1263
1264        return error;
1265}
1266
1267static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1268                        struct splice_desc *sd)
1269{
1270        char *src;
1271        int ret;
1272
1273        ret = buf->ops->confirm(pipe, buf);
1274        if (unlikely(ret))
1275                return ret;
1276
1277        /*
1278         * See if we can use the atomic maps, by prefaulting in the
1279         * pages and doing an atomic copy
1280         */
1281        if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1282                src = buf->ops->map(pipe, buf, 1);
1283                ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1284                                                        sd->len);
1285                buf->ops->unmap(pipe, buf, src);
1286                if (!ret) {
1287                        ret = sd->len;
1288                        goto out;
1289                }
1290        }
1291
1292        /*
1293         * No dice, use slow non-atomic map and copy
1294         */
1295        src = buf->ops->map(pipe, buf, 0);
1296
1297        ret = sd->len;
1298        if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1299                ret = -EFAULT;
1300
1301        buf->ops->unmap(pipe, buf, src);
1302out:
1303        if (ret > 0)
1304                sd->u.userptr += ret;
1305        return ret;
1306}
1307
1308/*
1309 * For lack of a better implementation, implement vmsplice() to userspace
1310 * as a simple copy of the pipes pages to the user iov.
1311 */
1312static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1313                             unsigned long nr_segs, unsigned int flags)
1314{
1315        struct pipe_inode_info *pipe;
1316        struct splice_desc sd;
1317        ssize_t size;
1318        int error;
1319        long ret;
1320
1321        pipe = pipe_info(file->f_path.dentry->d_inode);
1322        if (!pipe)
1323                return -EBADF;
1324
1325        if (pipe->inode)
1326                mutex_lock(&pipe->inode->i_mutex);
1327
1328        error = ret = 0;
1329        while (nr_segs) {
1330                void __user *base;
1331                size_t len;
1332
1333                /*
1334                 * Get user address base and length for this iovec.
1335                 */
1336                error = get_user(base, &iov->iov_base);
1337                if (unlikely(error))
1338                        break;
1339                error = get_user(len, &iov->iov_len);
1340                if (unlikely(error))
1341                        break;
1342
1343                /*
1344                 * Sanity check this iovec. 0 read succeeds.
1345                 */
1346                if (unlikely(!len))
1347                        break;
1348                if (unlikely(!base)) {
1349                        error = -EFAULT;
1350                        break;
1351                }
1352
1353                if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
1354                        error = -EFAULT;
1355                        break;
1356                }
1357
1358                sd.len = 0;
1359                sd.total_len = len;
1360                sd.flags = flags;
1361                sd.u.userptr = base;
1362                sd.pos = 0;
1363
1364                size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1365                if (size < 0) {
1366                        if (!ret)
1367                                ret = size;
1368
1369                        break;
1370                }
1371
1372                ret += size;
1373
1374                if (size < len)
1375                        break;
1376
1377                nr_segs--;
1378                iov++;
1379        }
1380
1381        if (pipe->inode)
1382                mutex_unlock(&pipe->inode->i_mutex);
1383
1384        if (!ret)
1385                ret = error;
1386
1387        return ret;
1388}
1389
1390/*
1391 * vmsplice splices a user address range into a pipe. It can be thought of
1392 * as splice-from-memory, where the regular splice is splice-from-file (or
1393 * to file). In both cases the output is a pipe, naturally.
1394 */
1395static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1396                             unsigned long nr_segs, unsigned int flags)
1397{
1398        struct pipe_inode_info *pipe;
1399        struct page *pages[PIPE_BUFFERS];
1400        struct partial_page partial[PIPE_BUFFERS];
1401        struct splice_pipe_desc spd = {
1402                .pages = pages,
1403                .partial = partial,
1404                .flags = flags,
1405                .ops = &user_page_pipe_buf_ops,
1406                .spd_release = spd_release_page,
1407        };
1408
1409        pipe = pipe_info(file->f_path.dentry->d_inode);
1410        if (!pipe)
1411                return -EBADF;
1412
1413        spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1414                                            flags & SPLICE_F_GIFT);
1415        if (spd.nr_pages <= 0)
1416                return spd.nr_pages;
1417
1418        return splice_to_pipe(pipe, &spd);
1419}
1420
1421/*
1422 * Note that vmsplice only really supports true splicing _from_ user memory
1423 * to a pipe, not the other way around. Splicing from user memory is a simple
1424 * operation that can be supported without any funky alignment restrictions
1425 * or nasty vm tricks. We simply map in the user memory and fill them into
1426 * a pipe. The reverse isn't quite as easy, though. There are two possible
1427 * solutions for that:
1428 *
1429 *      - memcpy() the data internally, at which point we might as well just
1430 *        do a regular read() on the buffer anyway.
1431 *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1432 *        has restriction limitations on both ends of the pipe).
1433 *
1434 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1435 *
1436 */
1437SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1438                unsigned long, nr_segs, unsigned int, flags)
1439{
1440        struct file *file;
1441        long error;
1442        int fput;
1443
1444        if (unlikely(nr_segs > UIO_MAXIOV))
1445                return -EINVAL;
1446        else if (unlikely(!nr_segs))
1447                return 0;
1448
1449        error = -EBADF;
1450        file = fget_light(fd, &fput);
1451        if (file) {
1452                if (file->f_mode & FMODE_WRITE)
1453                        error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1454                else if (file->f_mode & FMODE_READ)
1455                        error = vmsplice_to_user(file, iov, nr_segs, flags);
1456
1457                fput_light(file, fput);
1458        }
1459
1460        return error;
1461}
1462
1463SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1464                int, fd_out, loff_t __user *, off_out,
1465                size_t, len, unsigned int, flags)
1466{
1467        long error;
1468        struct file *in, *out;
1469        int fput_in, fput_out;
1470
1471        if (unlikely(!len))
1472                return 0;
1473
1474        error = -EBADF;
1475        in = fget_light(fd_in, &fput_in);
1476        if (in) {
1477                if (in->f_mode & FMODE_READ) {
1478                        out = fget_light(fd_out, &fput_out);
1479                        if (out) {
1480                                if (out->f_mode & FMODE_WRITE)
1481                                        error = do_splice(in, off_in,
1482                                                          out, off_out,
1483                                                          len, flags);
1484                                fput_light(out, fput_out);
1485                        }
1486                }
1487
1488                fput_light(in, fput_in);
1489        }
1490
1491        return error;
1492}
1493
1494/*
1495 * Make sure there's data to read. Wait for input if we can, otherwise
1496 * return an appropriate error.
1497 */
1498static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1499{
1500        int ret;
1501
1502        /*
1503         * Check ->nrbufs without the inode lock first. This function
1504         * is speculative anyways, so missing one is ok.
1505         */
1506        if (pipe->nrbufs)
1507                return 0;
1508
1509        ret = 0;
1510        mutex_lock(&pipe->inode->i_mutex);
1511
1512        while (!pipe->nrbufs) {
1513                if (signal_pending(current)) {
1514                        ret = -ERESTARTSYS;
1515                        break;
1516                }
1517                if (!pipe->writers)
1518                        break;
1519                if (!pipe->waiting_writers) {
1520                        if (flags & SPLICE_F_NONBLOCK) {
1521                                ret = -EAGAIN;
1522                                break;
1523                        }
1524                }
1525                pipe_wait(pipe);
1526        }
1527
1528        mutex_unlock(&pipe->inode->i_mutex);
1529        return ret;
1530}
1531
1532/*
1533 * Make sure there's writeable room. Wait for room if we can, otherwise
1534 * return an appropriate error.
1535 */
1536static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1537{
1538        int ret;
1539
1540        /*
1541         * Check ->nrbufs without the inode lock first. This function
1542         * is speculative anyways, so missing one is ok.
1543         */
1544        if (pipe->nrbufs < PIPE_BUFFERS)
1545                return 0;
1546
1547        ret = 0;
1548        mutex_lock(&pipe->inode->i_mutex);
1549
1550        while (pipe->nrbufs >= PIPE_BUFFERS) {
1551                if (!pipe->readers) {
1552                        send_sig(SIGPIPE, current, 0);
1553                        ret = -EPIPE;
1554                        break;
1555                }
1556                if (flags & SPLICE_F_NONBLOCK) {
1557                        ret = -EAGAIN;
1558                        break;
1559                }
1560                if (signal_pending(current)) {
1561                        ret = -ERESTARTSYS;
1562                        break;
1563                }
1564                pipe->waiting_writers++;
1565                pipe_wait(pipe);
1566                pipe->waiting_writers--;
1567        }
1568
1569        mutex_unlock(&pipe->inode->i_mutex);
1570        return ret;
1571}
1572
1573/*
1574 * Link contents of ipipe to opipe.
1575 */
1576static int link_pipe(struct pipe_inode_info *ipipe,
1577                     struct pipe_inode_info *opipe,
1578                     size_t len, unsigned int flags)
1579{
1580        struct pipe_buffer *ibuf, *obuf;
1581        int ret = 0, i = 0, nbuf;
1582
1583        /*
1584         * Potential ABBA deadlock, work around it by ordering lock
1585         * grabbing by inode address. Otherwise two different processes
1586         * could deadlock (one doing tee from A -> B, the other from B -> A).
1587         */
1588        inode_double_lock(ipipe->inode, opipe->inode);
1589
1590        do {
1591                if (!opipe->readers) {
1592                        send_sig(SIGPIPE, current, 0);
1593                        if (!ret)
1594                                ret = -EPIPE;
1595                        break;
1596                }
1597
1598                /*
1599                 * If we have iterated all input buffers or ran out of
1600                 * output room, break.
1601                 */
1602                if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
1603                        break;
1604
1605                ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1606                nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1607
1608                /*
1609                 * Get a reference to this pipe buffer,
1610                 * so we can copy the contents over.
1611                 */
1612                ibuf->ops->get(ipipe, ibuf);
1613
1614                obuf = opipe->bufs + nbuf;
1615                *obuf = *ibuf;
1616
1617                /*
1618                 * Don't inherit the gift flag, we need to
1619                 * prevent multiple steals of this page.
1620                 */
1621                obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1622
1623                if (obuf->len > len)
1624                        obuf->len = len;
1625
1626                opipe->nrbufs++;
1627                ret += obuf->len;
1628                len -= obuf->len;
1629                i++;
1630        } while (len);
1631
1632        /*
1633         * return EAGAIN if we have the potential of some data in the
1634         * future, otherwise just return 0
1635         */
1636        if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1637                ret = -EAGAIN;
1638
1639        inode_double_unlock(ipipe->inode, opipe->inode);
1640
1641        /*
1642         * If we put data in the output pipe, wakeup any potential readers.
1643         */
1644        if (ret > 0) {
1645                smp_mb();
1646                if (waitqueue_active(&opipe->wait))
1647                        wake_up_interruptible(&opipe->wait);
1648                kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1649        }
1650
1651        return ret;
1652}
1653
1654/*
1655 * This is a tee(1) implementation that works on pipes. It doesn't copy
1656 * any data, it simply references the 'in' pages on the 'out' pipe.
1657 * The 'flags' used are the SPLICE_F_* variants, currently the only
1658 * applicable one is SPLICE_F_NONBLOCK.
1659 */
1660static long do_tee(struct file *in, struct file *out, size_t len,
1661                   unsigned int flags)
1662{
1663        struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
1664        struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
1665        int ret = -EINVAL;
1666
1667        /*
1668         * Duplicate the contents of ipipe to opipe without actually
1669         * copying the data.
1670         */
1671        if (ipipe && opipe && ipipe != opipe) {
1672                /*
1673                 * Keep going, unless we encounter an error. The ipipe/opipe
1674                 * ordering doesn't really matter.
1675                 */
1676                ret = link_ipipe_prep(ipipe, flags);
1677                if (!ret) {
1678                        ret = link_opipe_prep(opipe, flags);
1679                        if (!ret)
1680                                ret = link_pipe(ipipe, opipe, len, flags);
1681                }
1682        }
1683
1684        return ret;
1685}
1686
1687SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1688{
1689        struct file *in;
1690        int error, fput_in;
1691
1692        if (unlikely(!len))
1693                return 0;
1694
1695        error = -EBADF;
1696        in = fget_light(fdin, &fput_in);
1697        if (in) {
1698                if (in->f_mode & FMODE_READ) {
1699                        int fput_out;
1700                        struct file *out = fget_light(fdout, &fput_out);
1701
1702                        if (out) {
1703                                if (out->f_mode & FMODE_WRITE)
1704                                        error = do_tee(in, out, len, flags);
1705                                fput_light(out, fput_out);
1706                        }
1707                }
1708                fput_light(in, fput_in);
1709        }
1710
1711        return error;
1712}
1713