linux/fs/splice.c
<<
>>
Prefs
   1/*
   2 * "splice": joining two ropes together by interweaving their strands.
   3 *
   4 * This is the "extended pipe" functionality, where a pipe is used as
   5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
   6 * buffer that you can use to transfer data from one end to the other.
   7 *
   8 * The traditional unix read/write is extended with a "splice()" operation
   9 * that transfers data buffers to or from a pipe buffer.
  10 *
  11 * Named by Larry McVoy, original implementation from Linus, extended by
  12 * Jens to support splicing to files, network, direct splicing, etc and
  13 * fixing lots of bugs.
  14 *
  15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
  16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
  17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
  18 *
  19 */
  20#include <linux/fs.h>
  21#include <linux/file.h>
  22#include <linux/pagemap.h>
  23#include <linux/splice.h>
  24#include <linux/memcontrol.h>
  25#include <linux/mm_inline.h>
  26#include <linux/swap.h>
  27#include <linux/writeback.h>
  28#include <linux/buffer_head.h>
  29#include <linux/module.h>
  30#include <linux/syscalls.h>
  31#include <linux/uio.h>
  32#include <linux/security.h>
  33
  34/*
  35 * Attempt to steal a page from a pipe buffer. This should perhaps go into
  36 * a vm helper function, it's already simplified quite a bit by the
  37 * addition of remove_mapping(). If success is returned, the caller may
  38 * attempt to reuse this page for another destination.
  39 */
  40static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
  41                                     struct pipe_buffer *buf)
  42{
  43        struct page *page = buf->page;
  44        struct address_space *mapping;
  45
  46        lock_page(page);
  47
  48        mapping = page_mapping(page);
  49        if (mapping) {
  50                WARN_ON(!PageUptodate(page));
  51
  52                /*
  53                 * At least for ext2 with nobh option, we need to wait on
  54                 * writeback completing on this page, since we'll remove it
  55                 * from the pagecache.  Otherwise truncate wont wait on the
  56                 * page, allowing the disk blocks to be reused by someone else
  57                 * before we actually wrote our data to them. fs corruption
  58                 * ensues.
  59                 */
  60                wait_on_page_writeback(page);
  61
  62                if (page_has_private(page) &&
  63                    !try_to_release_page(page, GFP_KERNEL))
  64                        goto out_unlock;
  65
  66                /*
  67                 * If we succeeded in removing the mapping, set LRU flag
  68                 * and return good.
  69                 */
  70                if (remove_mapping(mapping, page)) {
  71                        buf->flags |= PIPE_BUF_FLAG_LRU;
  72                        return 0;
  73                }
  74        }
  75
  76        /*
  77         * Raced with truncate or failed to remove page from current
  78         * address space, unlock and return failure.
  79         */
  80out_unlock:
  81        unlock_page(page);
  82        return 1;
  83}
  84
  85static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
  86                                        struct pipe_buffer *buf)
  87{
  88        page_cache_release(buf->page);
  89        buf->flags &= ~PIPE_BUF_FLAG_LRU;
  90}
  91
  92/*
  93 * Check whether the contents of buf is OK to access. Since the content
  94 * is a page cache page, IO may be in flight.
  95 */
  96static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
  97                                       struct pipe_buffer *buf)
  98{
  99        struct page *page = buf->page;
 100        int err;
 101
 102        if (!PageUptodate(page)) {
 103                lock_page(page);
 104
 105                /*
 106                 * Page got truncated/unhashed. This will cause a 0-byte
 107                 * splice, if this is the first page.
 108                 */
 109                if (!page->mapping) {
 110                        err = -ENODATA;
 111                        goto error;
 112                }
 113
 114                /*
 115                 * Uh oh, read-error from disk.
 116                 */
 117                if (!PageUptodate(page)) {
 118                        err = -EIO;
 119                        goto error;
 120                }
 121
 122                /*
 123                 * Page is ok afterall, we are done.
 124                 */
 125                unlock_page(page);
 126        }
 127
 128        return 0;
 129error:
 130        unlock_page(page);
 131        return err;
 132}
 133
 134static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
 135        .can_merge = 0,
 136        .map = generic_pipe_buf_map,
 137        .unmap = generic_pipe_buf_unmap,
 138        .confirm = page_cache_pipe_buf_confirm,
 139        .release = page_cache_pipe_buf_release,
 140        .steal = page_cache_pipe_buf_steal,
 141        .get = generic_pipe_buf_get,
 142};
 143
 144static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 145                                    struct pipe_buffer *buf)
 146{
 147        if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
 148                return 1;
 149
 150        buf->flags |= PIPE_BUF_FLAG_LRU;
 151        return generic_pipe_buf_steal(pipe, buf);
 152}
 153
 154static const struct pipe_buf_operations user_page_pipe_buf_ops = {
 155        .can_merge = 0,
 156        .map = generic_pipe_buf_map,
 157        .unmap = generic_pipe_buf_unmap,
 158        .confirm = generic_pipe_buf_confirm,
 159        .release = page_cache_pipe_buf_release,
 160        .steal = user_page_pipe_buf_steal,
 161        .get = generic_pipe_buf_get,
 162};
 163
 164/**
 165 * splice_to_pipe - fill passed data into a pipe
 166 * @pipe:       pipe to fill
 167 * @spd:        data to fill
 168 *
 169 * Description:
 170 *    @spd contains a map of pages and len/offset tuples, along with
 171 *    the struct pipe_buf_operations associated with these pages. This
 172 *    function will link that data to the pipe.
 173 *
 174 */
 175ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 176                       struct splice_pipe_desc *spd)
 177{
 178        unsigned int spd_pages = spd->nr_pages;
 179        int ret, do_wakeup, page_nr;
 180
 181        ret = 0;
 182        do_wakeup = 0;
 183        page_nr = 0;
 184
 185        pipe_lock(pipe);
 186
 187        for (;;) {
 188                if (!pipe->readers) {
 189                        send_sig(SIGPIPE, current, 0);
 190                        if (!ret)
 191                                ret = -EPIPE;
 192                        break;
 193                }
 194
 195                if (pipe->nrbufs < PIPE_BUFFERS) {
 196                        int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
 197                        struct pipe_buffer *buf = pipe->bufs + newbuf;
 198
 199                        buf->page = spd->pages[page_nr];
 200                        buf->offset = spd->partial[page_nr].offset;
 201                        buf->len = spd->partial[page_nr].len;
 202                        buf->private = spd->partial[page_nr].private;
 203                        buf->ops = spd->ops;
 204                        if (spd->flags & SPLICE_F_GIFT)
 205                                buf->flags |= PIPE_BUF_FLAG_GIFT;
 206
 207                        pipe->nrbufs++;
 208                        page_nr++;
 209                        ret += buf->len;
 210
 211                        if (pipe->inode)
 212                                do_wakeup = 1;
 213
 214                        if (!--spd->nr_pages)
 215                                break;
 216                        if (pipe->nrbufs < PIPE_BUFFERS)
 217                                continue;
 218
 219                        break;
 220                }
 221
 222                if (spd->flags & SPLICE_F_NONBLOCK) {
 223                        if (!ret)
 224                                ret = -EAGAIN;
 225                        break;
 226                }
 227
 228                if (signal_pending(current)) {
 229                        if (!ret)
 230                                ret = -ERESTARTSYS;
 231                        break;
 232                }
 233
 234                if (do_wakeup) {
 235                        smp_mb();
 236                        if (waitqueue_active(&pipe->wait))
 237                                wake_up_interruptible_sync(&pipe->wait);
 238                        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 239                        do_wakeup = 0;
 240                }
 241
 242                pipe->waiting_writers++;
 243                pipe_wait(pipe);
 244                pipe->waiting_writers--;
 245        }
 246
 247        pipe_unlock(pipe);
 248
 249        if (do_wakeup) {
 250                smp_mb();
 251                if (waitqueue_active(&pipe->wait))
 252                        wake_up_interruptible(&pipe->wait);
 253                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 254        }
 255
 256        while (page_nr < spd_pages)
 257                spd->spd_release(spd, page_nr++);
 258
 259        return ret;
 260}
 261
 262static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
 263{
 264        page_cache_release(spd->pages[i]);
 265}
 266
 267static int
 268__generic_file_splice_read(struct file *in, loff_t *ppos,
 269                           struct pipe_inode_info *pipe, size_t len,
 270                           unsigned int flags)
 271{
 272        struct address_space *mapping = in->f_mapping;
 273        unsigned int loff, nr_pages, req_pages;
 274        struct page *pages[PIPE_BUFFERS];
 275        struct partial_page partial[PIPE_BUFFERS];
 276        struct page *page;
 277        pgoff_t index, end_index;
 278        loff_t isize;
 279        int error, page_nr;
 280        struct splice_pipe_desc spd = {
 281                .pages = pages,
 282                .partial = partial,
 283                .flags = flags,
 284                .ops = &page_cache_pipe_buf_ops,
 285                .spd_release = spd_release_page,
 286        };
 287
 288        index = *ppos >> PAGE_CACHE_SHIFT;
 289        loff = *ppos & ~PAGE_CACHE_MASK;
 290        req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 291        nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
 292
 293        /*
 294         * Lookup the (hopefully) full range of pages we need.
 295         */
 296        spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
 297        index += spd.nr_pages;
 298
 299        /*
 300         * If find_get_pages_contig() returned fewer pages than we needed,
 301         * readahead/allocate the rest and fill in the holes.
 302         */
 303        if (spd.nr_pages < nr_pages)
 304                page_cache_sync_readahead(mapping, &in->f_ra, in,
 305                                index, req_pages - spd.nr_pages);
 306
 307        error = 0;
 308        while (spd.nr_pages < nr_pages) {
 309                /*
 310                 * Page could be there, find_get_pages_contig() breaks on
 311                 * the first hole.
 312                 */
 313                page = find_get_page(mapping, index);
 314                if (!page) {
 315                        /*
 316                         * page didn't exist, allocate one.
 317                         */
 318                        page = page_cache_alloc_cold(mapping);
 319                        if (!page)
 320                                break;
 321
 322                        error = add_to_page_cache_lru(page, mapping, index,
 323                                                mapping_gfp_mask(mapping));
 324                        if (unlikely(error)) {
 325                                page_cache_release(page);
 326                                if (error == -EEXIST)
 327                                        continue;
 328                                break;
 329                        }
 330                        /*
 331                         * add_to_page_cache() locks the page, unlock it
 332                         * to avoid convoluting the logic below even more.
 333                         */
 334                        unlock_page(page);
 335                }
 336
 337                pages[spd.nr_pages++] = page;
 338                index++;
 339        }
 340
 341        /*
 342         * Now loop over the map and see if we need to start IO on any
 343         * pages, fill in the partial map, etc.
 344         */
 345        index = *ppos >> PAGE_CACHE_SHIFT;
 346        nr_pages = spd.nr_pages;
 347        spd.nr_pages = 0;
 348        for (page_nr = 0; page_nr < nr_pages; page_nr++) {
 349                unsigned int this_len;
 350
 351                if (!len)
 352                        break;
 353
 354                /*
 355                 * this_len is the max we'll use from this page
 356                 */
 357                this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
 358                page = pages[page_nr];
 359
 360                if (PageReadahead(page))
 361                        page_cache_async_readahead(mapping, &in->f_ra, in,
 362                                        page, index, req_pages - page_nr);
 363
 364                /*
 365                 * If the page isn't uptodate, we may need to start io on it
 366                 */
 367                if (!PageUptodate(page)) {
 368                        /*
 369                         * If in nonblock mode then dont block on waiting
 370                         * for an in-flight io page
 371                         */
 372                        if (flags & SPLICE_F_NONBLOCK) {
 373                                if (!trylock_page(page)) {
 374                                        error = -EAGAIN;
 375                                        break;
 376                                }
 377                        } else
 378                                lock_page(page);
 379
 380                        /*
 381                         * Page was truncated, or invalidated by the
 382                         * filesystem.  Redo the find/create, but this time the
 383                         * page is kept locked, so there's no chance of another
 384                         * race with truncate/invalidate.
 385                         */
 386                        if (!page->mapping) {
 387                                unlock_page(page);
 388                                page = find_or_create_page(mapping, index,
 389                                                mapping_gfp_mask(mapping));
 390
 391                                if (!page) {
 392                                        error = -ENOMEM;
 393                                        break;
 394                                }
 395                                page_cache_release(pages[page_nr]);
 396                                pages[page_nr] = page;
 397                        }
 398                        /*
 399                         * page was already under io and is now done, great
 400                         */
 401                        if (PageUptodate(page)) {
 402                                unlock_page(page);
 403                                goto fill_it;
 404                        }
 405
 406                        /*
 407                         * need to read in the page
 408                         */
 409                        error = mapping->a_ops->readpage(in, page);
 410                        if (unlikely(error)) {
 411                                /*
 412                                 * We really should re-lookup the page here,
 413                                 * but it complicates things a lot. Instead
 414                                 * lets just do what we already stored, and
 415                                 * we'll get it the next time we are called.
 416                                 */
 417                                if (error == AOP_TRUNCATED_PAGE)
 418                                        error = 0;
 419
 420                                break;
 421                        }
 422                }
 423fill_it:
 424                /*
 425                 * i_size must be checked after PageUptodate.
 426                 */
 427                isize = i_size_read(mapping->host);
 428                end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 429                if (unlikely(!isize || index > end_index))
 430                        break;
 431
 432                /*
 433                 * if this is the last page, see if we need to shrink
 434                 * the length and stop
 435                 */
 436                if (end_index == index) {
 437                        unsigned int plen;
 438
 439                        /*
 440                         * max good bytes in this page
 441                         */
 442                        plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
 443                        if (plen <= loff)
 444                                break;
 445
 446                        /*
 447                         * force quit after adding this page
 448                         */
 449                        this_len = min(this_len, plen - loff);
 450                        len = this_len;
 451                }
 452
 453                partial[page_nr].offset = loff;
 454                partial[page_nr].len = this_len;
 455                len -= this_len;
 456                loff = 0;
 457                spd.nr_pages++;
 458                index++;
 459        }
 460
 461        /*
 462         * Release any pages at the end, if we quit early. 'page_nr' is how far
 463         * we got, 'nr_pages' is how many pages are in the map.
 464         */
 465        while (page_nr < nr_pages)
 466                page_cache_release(pages[page_nr++]);
 467        in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 468
 469        if (spd.nr_pages)
 470                return splice_to_pipe(pipe, &spd);
 471
 472        return error;
 473}
 474
 475/**
 476 * generic_file_splice_read - splice data from file to a pipe
 477 * @in:         file to splice from
 478 * @ppos:       position in @in
 479 * @pipe:       pipe to splice to
 480 * @len:        number of bytes to splice
 481 * @flags:      splice modifier flags
 482 *
 483 * Description:
 484 *    Will read pages from given file and fill them into a pipe. Can be
 485 *    used as long as the address_space operations for the source implements
 486 *    a readpage() hook.
 487 *
 488 */
 489ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 490                                 struct pipe_inode_info *pipe, size_t len,
 491                                 unsigned int flags)
 492{
 493        loff_t isize, left;
 494        int ret;
 495
 496        isize = i_size_read(in->f_mapping->host);
 497        if (unlikely(*ppos >= isize))
 498                return 0;
 499
 500        left = isize - *ppos;
 501        if (unlikely(left < len))
 502                len = left;
 503
 504        ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
 505        if (ret > 0)
 506                *ppos += ret;
 507
 508        return ret;
 509}
 510
 511EXPORT_SYMBOL(generic_file_splice_read);
 512
 513/*
 514 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 515 * using sendpage(). Return the number of bytes sent.
 516 */
 517static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 518                            struct pipe_buffer *buf, struct splice_desc *sd)
 519{
 520        struct file *file = sd->u.file;
 521        loff_t pos = sd->pos;
 522        int ret, more;
 523
 524        ret = buf->ops->confirm(pipe, buf);
 525        if (!ret) {
 526                more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
 527
 528                ret = file->f_op->sendpage(file, buf->page, buf->offset,
 529                                           sd->len, &pos, more);
 530        }
 531
 532        return ret;
 533}
 534
 535/*
 536 * This is a little more tricky than the file -> pipe splicing. There are
 537 * basically three cases:
 538 *
 539 *      - Destination page already exists in the address space and there
 540 *        are users of it. For that case we have no other option that
 541 *        copying the data. Tough luck.
 542 *      - Destination page already exists in the address space, but there
 543 *        are no users of it. Make sure it's uptodate, then drop it. Fall
 544 *        through to last case.
 545 *      - Destination page does not exist, we can add the pipe page to
 546 *        the page cache and avoid the copy.
 547 *
 548 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 549 * sd->flags), we attempt to migrate pages from the pipe to the output
 550 * file address space page cache. This is possible if no one else has
 551 * the pipe page referenced outside of the pipe and page cache. If
 552 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 553 * a new page in the output file page cache and fill/dirty that.
 554 */
 555int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 556                 struct splice_desc *sd)
 557{
 558        struct file *file = sd->u.file;
 559        struct address_space *mapping = file->f_mapping;
 560        unsigned int offset, this_len;
 561        struct page *page;
 562        void *fsdata;
 563        int ret;
 564
 565        /*
 566         * make sure the data in this buffer is uptodate
 567         */
 568        ret = buf->ops->confirm(pipe, buf);
 569        if (unlikely(ret))
 570                return ret;
 571
 572        offset = sd->pos & ~PAGE_CACHE_MASK;
 573
 574        this_len = sd->len;
 575        if (this_len + offset > PAGE_CACHE_SIZE)
 576                this_len = PAGE_CACHE_SIZE - offset;
 577
 578        ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
 579                                AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 580        if (unlikely(ret))
 581                goto out;
 582
 583        if (buf->page != page) {
 584                /*
 585                 * Careful, ->map() uses KM_USER0!
 586                 */
 587                char *src = buf->ops->map(pipe, buf, 1);
 588                char *dst = kmap_atomic(page, KM_USER1);
 589
 590                memcpy(dst + offset, src + buf->offset, this_len);
 591                flush_dcache_page(page);
 592                kunmap_atomic(dst, KM_USER1);
 593                buf->ops->unmap(pipe, buf, src);
 594        }
 595        ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
 596                                page, fsdata);
 597out:
 598        return ret;
 599}
 600EXPORT_SYMBOL(pipe_to_file);
 601
 602static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
 603{
 604        smp_mb();
 605        if (waitqueue_active(&pipe->wait))
 606                wake_up_interruptible(&pipe->wait);
 607        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 608}
 609
 610/**
 611 * splice_from_pipe_feed - feed available data from a pipe to a file
 612 * @pipe:       pipe to splice from
 613 * @sd:         information to @actor
 614 * @actor:      handler that splices the data
 615 *
 616 * Description:
 617 *    This function loops over the pipe and calls @actor to do the
 618 *    actual moving of a single struct pipe_buffer to the desired
 619 *    destination.  It returns when there's no more buffers left in
 620 *    the pipe or if the requested number of bytes (@sd->total_len)
 621 *    have been copied.  It returns a positive number (one) if the
 622 *    pipe needs to be filled with more data, zero if the required
 623 *    number of bytes have been copied and -errno on error.
 624 *
 625 *    This, together with splice_from_pipe_{begin,end,next}, may be
 626 *    used to implement the functionality of __splice_from_pipe() when
 627 *    locking is required around copying the pipe buffers to the
 628 *    destination.
 629 */
 630int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 631                          splice_actor *actor)
 632{
 633        int ret;
 634
 635        while (pipe->nrbufs) {
 636                struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 637                const struct pipe_buf_operations *ops = buf->ops;
 638
 639                sd->len = buf->len;
 640                if (sd->len > sd->total_len)
 641                        sd->len = sd->total_len;
 642
 643                ret = actor(pipe, buf, sd);
 644                if (ret <= 0) {
 645                        if (ret == -ENODATA)
 646                                ret = 0;
 647                        return ret;
 648                }
 649                buf->offset += ret;
 650                buf->len -= ret;
 651
 652                sd->num_spliced += ret;
 653                sd->len -= ret;
 654                sd->pos += ret;
 655                sd->total_len -= ret;
 656
 657                if (!buf->len) {
 658                        buf->ops = NULL;
 659                        ops->release(pipe, buf);
 660                        pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
 661                        pipe->nrbufs--;
 662                        if (pipe->inode)
 663                                sd->need_wakeup = true;
 664                }
 665
 666                if (!sd->total_len)
 667                        return 0;
 668        }
 669
 670        return 1;
 671}
 672EXPORT_SYMBOL(splice_from_pipe_feed);
 673
 674/**
 675 * splice_from_pipe_next - wait for some data to splice from
 676 * @pipe:       pipe to splice from
 677 * @sd:         information about the splice operation
 678 *
 679 * Description:
 680 *    This function will wait for some data and return a positive
 681 *    value (one) if pipe buffers are available.  It will return zero
 682 *    or -errno if no more data needs to be spliced.
 683 */
 684int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
 685{
 686        while (!pipe->nrbufs) {
 687                if (!pipe->writers)
 688                        return 0;
 689
 690                if (!pipe->waiting_writers && sd->num_spliced)
 691                        return 0;
 692
 693                if (sd->flags & SPLICE_F_NONBLOCK)
 694                        return -EAGAIN;
 695
 696                if (signal_pending(current))
 697                        return -ERESTARTSYS;
 698
 699                if (sd->need_wakeup) {
 700                        wakeup_pipe_writers(pipe);
 701                        sd->need_wakeup = false;
 702                }
 703
 704                pipe_wait(pipe);
 705        }
 706
 707        return 1;
 708}
 709EXPORT_SYMBOL(splice_from_pipe_next);
 710
 711/**
 712 * splice_from_pipe_begin - start splicing from pipe
 713 * @sd:         information about the splice operation
 714 *
 715 * Description:
 716 *    This function should be called before a loop containing
 717 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 718 *    initialize the necessary fields of @sd.
 719 */
 720void splice_from_pipe_begin(struct splice_desc *sd)
 721{
 722        sd->num_spliced = 0;
 723        sd->need_wakeup = false;
 724}
 725EXPORT_SYMBOL(splice_from_pipe_begin);
 726
 727/**
 728 * splice_from_pipe_end - finish splicing from pipe
 729 * @pipe:       pipe to splice from
 730 * @sd:         information about the splice operation
 731 *
 732 * Description:
 733 *    This function will wake up pipe writers if necessary.  It should
 734 *    be called after a loop containing splice_from_pipe_next() and
 735 *    splice_from_pipe_feed().
 736 */
 737void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
 738{
 739        if (sd->need_wakeup)
 740                wakeup_pipe_writers(pipe);
 741}
 742EXPORT_SYMBOL(splice_from_pipe_end);
 743
 744/**
 745 * __splice_from_pipe - splice data from a pipe to given actor
 746 * @pipe:       pipe to splice from
 747 * @sd:         information to @actor
 748 * @actor:      handler that splices the data
 749 *
 750 * Description:
 751 *    This function does little more than loop over the pipe and call
 752 *    @actor to do the actual moving of a single struct pipe_buffer to
 753 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 754 *    pipe_to_user.
 755 *
 756 */
 757ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
 758                           splice_actor *actor)
 759{
 760        int ret;
 761
 762        splice_from_pipe_begin(sd);
 763        do {
 764                ret = splice_from_pipe_next(pipe, sd);
 765                if (ret > 0)
 766                        ret = splice_from_pipe_feed(pipe, sd, actor);
 767        } while (ret > 0);
 768        splice_from_pipe_end(pipe, sd);
 769
 770        return sd->num_spliced ? sd->num_spliced : ret;
 771}
 772EXPORT_SYMBOL(__splice_from_pipe);
 773
 774/**
 775 * splice_from_pipe - splice data from a pipe to a file
 776 * @pipe:       pipe to splice from
 777 * @out:        file to splice to
 778 * @ppos:       position in @out
 779 * @len:        how many bytes to splice
 780 * @flags:      splice modifier flags
 781 * @actor:      handler that splices the data
 782 *
 783 * Description:
 784 *    See __splice_from_pipe. This function locks the pipe inode,
 785 *    otherwise it's identical to __splice_from_pipe().
 786 *
 787 */
 788ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 789                         loff_t *ppos, size_t len, unsigned int flags,
 790                         splice_actor *actor)
 791{
 792        ssize_t ret;
 793        struct splice_desc sd = {
 794                .total_len = len,
 795                .flags = flags,
 796                .pos = *ppos,
 797                .u.file = out,
 798        };
 799
 800        pipe_lock(pipe);
 801        ret = __splice_from_pipe(pipe, &sd, actor);
 802        pipe_unlock(pipe);
 803
 804        return ret;
 805}
 806
 807/**
 808 * generic_file_splice_write - splice data from a pipe to a file
 809 * @pipe:       pipe info
 810 * @out:        file to write to
 811 * @ppos:       position in @out
 812 * @len:        number of bytes to splice
 813 * @flags:      splice modifier flags
 814 *
 815 * Description:
 816 *    Will either move or copy pages (determined by @flags options) from
 817 *    the given pipe inode to the given file.
 818 *
 819 */
 820ssize_t
 821generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 822                          loff_t *ppos, size_t len, unsigned int flags)
 823{
 824        struct address_space *mapping = out->f_mapping;
 825        struct inode *inode = mapping->host;
 826        struct splice_desc sd = {
 827                .total_len = len,
 828                .flags = flags,
 829                .pos = *ppos,
 830                .u.file = out,
 831        };
 832        ssize_t ret;
 833
 834        pipe_lock(pipe);
 835
 836        splice_from_pipe_begin(&sd);
 837        do {
 838                ret = splice_from_pipe_next(pipe, &sd);
 839                if (ret <= 0)
 840                        break;
 841
 842                mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
 843                ret = file_remove_suid(out);
 844                if (!ret)
 845                        ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
 846                mutex_unlock(&inode->i_mutex);
 847        } while (ret > 0);
 848        splice_from_pipe_end(pipe, &sd);
 849
 850        pipe_unlock(pipe);
 851
 852        if (sd.num_spliced)
 853                ret = sd.num_spliced;
 854
 855        if (ret > 0) {
 856                unsigned long nr_pages;
 857
 858                *ppos += ret;
 859                nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 860
 861                /*
 862                 * If file or inode is SYNC and we actually wrote some data,
 863                 * sync it.
 864                 */
 865                if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
 866                        int err;
 867
 868                        mutex_lock(&inode->i_mutex);
 869                        err = generic_osync_inode(inode, mapping,
 870                                                  OSYNC_METADATA|OSYNC_DATA);
 871                        mutex_unlock(&inode->i_mutex);
 872
 873                        if (err)
 874                                ret = err;
 875                }
 876                balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 877        }
 878
 879        return ret;
 880}
 881
 882EXPORT_SYMBOL(generic_file_splice_write);
 883
 884/**
 885 * generic_splice_sendpage - splice data from a pipe to a socket
 886 * @pipe:       pipe to splice from
 887 * @out:        socket to write to
 888 * @ppos:       position in @out
 889 * @len:        number of bytes to splice
 890 * @flags:      splice modifier flags
 891 *
 892 * Description:
 893 *    Will send @len bytes from the pipe to a network socket. No data copying
 894 *    is involved.
 895 *
 896 */
 897ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 898                                loff_t *ppos, size_t len, unsigned int flags)
 899{
 900        return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 901}
 902
 903EXPORT_SYMBOL(generic_splice_sendpage);
 904
 905/*
 906 * Attempt to initiate a splice from pipe to file.
 907 */
 908static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 909                           loff_t *ppos, size_t len, unsigned int flags)
 910{
 911        int ret;
 912
 913        if (unlikely(!out->f_op || !out->f_op->splice_write))
 914                return -EINVAL;
 915
 916        if (unlikely(!(out->f_mode & FMODE_WRITE)))
 917                return -EBADF;
 918
 919        if (unlikely(out->f_flags & O_APPEND))
 920                return -EINVAL;
 921
 922        ret = rw_verify_area(WRITE, out, ppos, len);
 923        if (unlikely(ret < 0))
 924                return ret;
 925
 926        return out->f_op->splice_write(pipe, out, ppos, len, flags);
 927}
 928
 929/*
 930 * Attempt to initiate a splice from a file to a pipe.
 931 */
 932static long do_splice_to(struct file *in, loff_t *ppos,
 933                         struct pipe_inode_info *pipe, size_t len,
 934                         unsigned int flags)
 935{
 936        int ret;
 937
 938        if (unlikely(!in->f_op || !in->f_op->splice_read))
 939                return -EINVAL;
 940
 941        if (unlikely(!(in->f_mode & FMODE_READ)))
 942                return -EBADF;
 943
 944        ret = rw_verify_area(READ, in, ppos, len);
 945        if (unlikely(ret < 0))
 946                return ret;
 947
 948        return in->f_op->splice_read(in, ppos, pipe, len, flags);
 949}
 950
 951/**
 952 * splice_direct_to_actor - splices data directly between two non-pipes
 953 * @in:         file to splice from
 954 * @sd:         actor information on where to splice to
 955 * @actor:      handles the data splicing
 956 *
 957 * Description:
 958 *    This is a special case helper to splice directly between two
 959 *    points, without requiring an explicit pipe. Internally an allocated
 960 *    pipe is cached in the process, and reused during the lifetime of
 961 *    that process.
 962 *
 963 */
 964ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 965                               splice_direct_actor *actor)
 966{
 967        struct pipe_inode_info *pipe;
 968        long ret, bytes;
 969        umode_t i_mode;
 970        size_t len;
 971        int i, flags;
 972
 973        /*
 974         * We require the input being a regular file, as we don't want to
 975         * randomly drop data for eg socket -> socket splicing. Use the
 976         * piped splicing for that!
 977         */
 978        i_mode = in->f_path.dentry->d_inode->i_mode;
 979        if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
 980                return -EINVAL;
 981
 982        /*
 983         * neither in nor out is a pipe, setup an internal pipe attached to
 984         * 'out' and transfer the wanted data from 'in' to 'out' through that
 985         */
 986        pipe = current->splice_pipe;
 987        if (unlikely(!pipe)) {
 988                pipe = alloc_pipe_info(NULL);
 989                if (!pipe)
 990                        return -ENOMEM;
 991
 992                /*
 993                 * We don't have an immediate reader, but we'll read the stuff
 994                 * out of the pipe right after the splice_to_pipe(). So set
 995                 * PIPE_READERS appropriately.
 996                 */
 997                pipe->readers = 1;
 998
 999                current->splice_pipe = pipe;
1000        }
1001
1002        /*
1003         * Do the splice.
1004         */
1005        ret = 0;
1006        bytes = 0;
1007        len = sd->total_len;
1008        flags = sd->flags;
1009
1010        /*
1011         * Don't block on output, we have to drain the direct pipe.
1012         */
1013        sd->flags &= ~SPLICE_F_NONBLOCK;
1014
1015        while (len) {
1016                size_t read_len;
1017                loff_t pos = sd->pos, prev_pos = pos;
1018
1019                ret = do_splice_to(in, &pos, pipe, len, flags);
1020                if (unlikely(ret <= 0))
1021                        goto out_release;
1022
1023                read_len = ret;
1024                sd->total_len = read_len;
1025
1026                /*
1027                 * NOTE: nonblocking mode only applies to the input. We
1028                 * must not do the output in nonblocking mode as then we
1029                 * could get stuck data in the internal pipe:
1030                 */
1031                ret = actor(pipe, sd);
1032                if (unlikely(ret <= 0)) {
1033                        sd->pos = prev_pos;
1034                        goto out_release;
1035                }
1036
1037                bytes += ret;
1038                len -= ret;
1039                sd->pos = pos;
1040
1041                if (ret < read_len) {
1042                        sd->pos = prev_pos + ret;
1043                        goto out_release;
1044                }
1045        }
1046
1047done:
1048        pipe->nrbufs = pipe->curbuf = 0;
1049        file_accessed(in);
1050        return bytes;
1051
1052out_release:
1053        /*
1054         * If we did an incomplete transfer we must release
1055         * the pipe buffers in question:
1056         */
1057        for (i = 0; i < PIPE_BUFFERS; i++) {
1058                struct pipe_buffer *buf = pipe->bufs + i;
1059
1060                if (buf->ops) {
1061                        buf->ops->release(pipe, buf);
1062                        buf->ops = NULL;
1063                }
1064        }
1065
1066        if (!bytes)
1067                bytes = ret;
1068
1069        goto done;
1070}
1071EXPORT_SYMBOL(splice_direct_to_actor);
1072
1073static int direct_splice_actor(struct pipe_inode_info *pipe,
1074                               struct splice_desc *sd)
1075{
1076        struct file *file = sd->u.file;
1077
1078        return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
1079}
1080
1081/**
1082 * do_splice_direct - splices data directly between two files
1083 * @in:         file to splice from
1084 * @ppos:       input file offset
1085 * @out:        file to splice to
1086 * @len:        number of bytes to splice
1087 * @flags:      splice modifier flags
1088 *
1089 * Description:
1090 *    For use by do_sendfile(). splice can easily emulate sendfile, but
1091 *    doing it in the application would incur an extra system call
1092 *    (splice in + splice out, as compared to just sendfile()). So this helper
1093 *    can splice directly through a process-private pipe.
1094 *
1095 */
1096long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1097                      size_t len, unsigned int flags)
1098{
1099        struct splice_desc sd = {
1100                .len            = len,
1101                .total_len      = len,
1102                .flags          = flags,
1103                .pos            = *ppos,
1104                .u.file         = out,
1105        };
1106        long ret;
1107
1108        ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1109        if (ret > 0)
1110                *ppos = sd.pos;
1111
1112        return ret;
1113}
1114
1115/*
1116 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1117 * location, so checking ->i_pipe is not enough to verify that this is a
1118 * pipe.
1119 */
1120static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1121{
1122        if (S_ISFIFO(inode->i_mode))
1123                return inode->i_pipe;
1124
1125        return NULL;
1126}
1127
1128/*
1129 * Determine where to splice to/from.
1130 */
1131static long do_splice(struct file *in, loff_t __user *off_in,
1132                      struct file *out, loff_t __user *off_out,
1133                      size_t len, unsigned int flags)
1134{
1135        struct pipe_inode_info *pipe;
1136        loff_t offset, *off;
1137        long ret;
1138
1139        pipe = pipe_info(in->f_path.dentry->d_inode);
1140        if (pipe) {
1141                if (off_in)
1142                        return -ESPIPE;
1143                if (off_out) {
1144                        if (out->f_op->llseek == no_llseek)
1145                                return -EINVAL;
1146                        if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1147                                return -EFAULT;
1148                        off = &offset;
1149                } else
1150                        off = &out->f_pos;
1151
1152                ret = do_splice_from(pipe, out, off, len, flags);
1153
1154                if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1155                        ret = -EFAULT;
1156
1157                return ret;
1158        }
1159
1160        pipe = pipe_info(out->f_path.dentry->d_inode);
1161        if (pipe) {
1162                if (off_out)
1163                        return -ESPIPE;
1164                if (off_in) {
1165                        if (in->f_op->llseek == no_llseek)
1166                                return -EINVAL;
1167                        if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1168                                return -EFAULT;
1169                        off = &offset;
1170                } else
1171                        off = &in->f_pos;
1172
1173                ret = do_splice_to(in, off, pipe, len, flags);
1174
1175                if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1176                        ret = -EFAULT;
1177
1178                return ret;
1179        }
1180
1181        return -EINVAL;
1182}
1183
1184/*
1185 * Map an iov into an array of pages and offset/length tupples. With the
1186 * partial_page structure, we can map several non-contiguous ranges into
1187 * our ones pages[] map instead of splitting that operation into pieces.
1188 * Could easily be exported as a generic helper for other users, in which
1189 * case one would probably want to add a 'max_nr_pages' parameter as well.
1190 */
1191static int get_iovec_page_array(const struct iovec __user *iov,
1192                                unsigned int nr_vecs, struct page **pages,
1193                                struct partial_page *partial, int aligned)
1194{
1195        int buffers = 0, error = 0;
1196
1197        while (nr_vecs) {
1198                unsigned long off, npages;
1199                struct iovec entry;
1200                void __user *base;
1201                size_t len;
1202                int i;
1203
1204                error = -EFAULT;
1205                if (copy_from_user(&entry, iov, sizeof(entry)))
1206                        break;
1207
1208                base = entry.iov_base;
1209                len = entry.iov_len;
1210
1211                /*
1212                 * Sanity check this iovec. 0 read succeeds.
1213                 */
1214                error = 0;
1215                if (unlikely(!len))
1216                        break;
1217                error = -EFAULT;
1218                if (!access_ok(VERIFY_READ, base, len))
1219                        break;
1220
1221                /*
1222                 * Get this base offset and number of pages, then map
1223                 * in the user pages.
1224                 */
1225                off = (unsigned long) base & ~PAGE_MASK;
1226
1227                /*
1228                 * If asked for alignment, the offset must be zero and the
1229                 * length a multiple of the PAGE_SIZE.
1230                 */
1231                error = -EINVAL;
1232                if (aligned && (off || len & ~PAGE_MASK))
1233                        break;
1234
1235                npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1236                if (npages > PIPE_BUFFERS - buffers)
1237                        npages = PIPE_BUFFERS - buffers;
1238
1239                error = get_user_pages_fast((unsigned long)base, npages,
1240                                        0, &pages[buffers]);
1241
1242                if (unlikely(error <= 0))
1243                        break;
1244
1245                /*
1246                 * Fill this contiguous range into the partial page map.
1247                 */
1248                for (i = 0; i < error; i++) {
1249                        const int plen = min_t(size_t, len, PAGE_SIZE - off);
1250
1251                        partial[buffers].offset = off;
1252                        partial[buffers].len = plen;
1253
1254                        off = 0;
1255                        len -= plen;
1256                        buffers++;
1257                }
1258
1259                /*
1260                 * We didn't complete this iov, stop here since it probably
1261                 * means we have to move some of this into a pipe to
1262                 * be able to continue.
1263                 */
1264                if (len)
1265                        break;
1266
1267                /*
1268                 * Don't continue if we mapped fewer pages than we asked for,
1269                 * or if we mapped the max number of pages that we have
1270                 * room for.
1271                 */
1272                if (error < npages || buffers == PIPE_BUFFERS)
1273                        break;
1274
1275                nr_vecs--;
1276                iov++;
1277        }
1278
1279        if (buffers)
1280                return buffers;
1281
1282        return error;
1283}
1284
1285static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1286                        struct splice_desc *sd)
1287{
1288        char *src;
1289        int ret;
1290
1291        ret = buf->ops->confirm(pipe, buf);
1292        if (unlikely(ret))
1293                return ret;
1294
1295        /*
1296         * See if we can use the atomic maps, by prefaulting in the
1297         * pages and doing an atomic copy
1298         */
1299        if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1300                src = buf->ops->map(pipe, buf, 1);
1301                ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1302                                                        sd->len);
1303                buf->ops->unmap(pipe, buf, src);
1304                if (!ret) {
1305                        ret = sd->len;
1306                        goto out;
1307                }
1308        }
1309
1310        /*
1311         * No dice, use slow non-atomic map and copy
1312         */
1313        src = buf->ops->map(pipe, buf, 0);
1314
1315        ret = sd->len;
1316        if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1317                ret = -EFAULT;
1318
1319        buf->ops->unmap(pipe, buf, src);
1320out:
1321        if (ret > 0)
1322                sd->u.userptr += ret;
1323        return ret;
1324}
1325
1326/*
1327 * For lack of a better implementation, implement vmsplice() to userspace
1328 * as a simple copy of the pipes pages to the user iov.
1329 */
1330static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1331                             unsigned long nr_segs, unsigned int flags)
1332{
1333        struct pipe_inode_info *pipe;
1334        struct splice_desc sd;
1335        ssize_t size;
1336        int error;
1337        long ret;
1338
1339        pipe = pipe_info(file->f_path.dentry->d_inode);
1340        if (!pipe)
1341                return -EBADF;
1342
1343        pipe_lock(pipe);
1344
1345        error = ret = 0;
1346        while (nr_segs) {
1347                void __user *base;
1348                size_t len;
1349
1350                /*
1351                 * Get user address base and length for this iovec.
1352                 */
1353                error = get_user(base, &iov->iov_base);
1354                if (unlikely(error))
1355                        break;
1356                error = get_user(len, &iov->iov_len);
1357                if (unlikely(error))
1358                        break;
1359
1360                /*
1361                 * Sanity check this iovec. 0 read succeeds.
1362                 */
1363                if (unlikely(!len))
1364                        break;
1365                if (unlikely(!base)) {
1366                        error = -EFAULT;
1367                        break;
1368                }
1369
1370                if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
1371                        error = -EFAULT;
1372                        break;
1373                }
1374
1375                sd.len = 0;
1376                sd.total_len = len;
1377                sd.flags = flags;
1378                sd.u.userptr = base;
1379                sd.pos = 0;
1380
1381                size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1382                if (size < 0) {
1383                        if (!ret)
1384                                ret = size;
1385
1386                        break;
1387                }
1388
1389                ret += size;
1390
1391                if (size < len)
1392                        break;
1393
1394                nr_segs--;
1395                iov++;
1396        }
1397
1398        pipe_unlock(pipe);
1399
1400        if (!ret)
1401                ret = error;
1402
1403        return ret;
1404}
1405
1406/*
1407 * vmsplice splices a user address range into a pipe. It can be thought of
1408 * as splice-from-memory, where the regular splice is splice-from-file (or
1409 * to file). In both cases the output is a pipe, naturally.
1410 */
1411static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1412                             unsigned long nr_segs, unsigned int flags)
1413{
1414        struct pipe_inode_info *pipe;
1415        struct page *pages[PIPE_BUFFERS];
1416        struct partial_page partial[PIPE_BUFFERS];
1417        struct splice_pipe_desc spd = {
1418                .pages = pages,
1419                .partial = partial,
1420                .flags = flags,
1421                .ops = &user_page_pipe_buf_ops,
1422                .spd_release = spd_release_page,
1423        };
1424
1425        pipe = pipe_info(file->f_path.dentry->d_inode);
1426        if (!pipe)
1427                return -EBADF;
1428
1429        spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1430                                            flags & SPLICE_F_GIFT);
1431        if (spd.nr_pages <= 0)
1432                return spd.nr_pages;
1433
1434        return splice_to_pipe(pipe, &spd);
1435}
1436
1437/*
1438 * Note that vmsplice only really supports true splicing _from_ user memory
1439 * to a pipe, not the other way around. Splicing from user memory is a simple
1440 * operation that can be supported without any funky alignment restrictions
1441 * or nasty vm tricks. We simply map in the user memory and fill them into
1442 * a pipe. The reverse isn't quite as easy, though. There are two possible
1443 * solutions for that:
1444 *
1445 *      - memcpy() the data internally, at which point we might as well just
1446 *        do a regular read() on the buffer anyway.
1447 *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1448 *        has restriction limitations on both ends of the pipe).
1449 *
1450 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1451 *
1452 */
1453SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1454                unsigned long, nr_segs, unsigned int, flags)
1455{
1456        struct file *file;
1457        long error;
1458        int fput;
1459
1460        if (unlikely(nr_segs > UIO_MAXIOV))
1461                return -EINVAL;
1462        else if (unlikely(!nr_segs))
1463                return 0;
1464
1465        error = -EBADF;
1466        file = fget_light(fd, &fput);
1467        if (file) {
1468                if (file->f_mode & FMODE_WRITE)
1469                        error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1470                else if (file->f_mode & FMODE_READ)
1471                        error = vmsplice_to_user(file, iov, nr_segs, flags);
1472
1473                fput_light(file, fput);
1474        }
1475
1476        return error;
1477}
1478
1479SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1480                int, fd_out, loff_t __user *, off_out,
1481                size_t, len, unsigned int, flags)
1482{
1483        long error;
1484        struct file *in, *out;
1485        int fput_in, fput_out;
1486
1487        if (unlikely(!len))
1488                return 0;
1489
1490        error = -EBADF;
1491        in = fget_light(fd_in, &fput_in);
1492        if (in) {
1493                if (in->f_mode & FMODE_READ) {
1494                        out = fget_light(fd_out, &fput_out);
1495                        if (out) {
1496                                if (out->f_mode & FMODE_WRITE)
1497                                        error = do_splice(in, off_in,
1498                                                          out, off_out,
1499                                                          len, flags);
1500                                fput_light(out, fput_out);
1501                        }
1502                }
1503
1504                fput_light(in, fput_in);
1505        }
1506
1507        return error;
1508}
1509
1510/*
1511 * Make sure there's data to read. Wait for input if we can, otherwise
1512 * return an appropriate error.
1513 */
1514static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1515{
1516        int ret;
1517
1518        /*
1519         * Check ->nrbufs without the inode lock first. This function
1520         * is speculative anyways, so missing one is ok.
1521         */
1522        if (pipe->nrbufs)
1523                return 0;
1524
1525        ret = 0;
1526        pipe_lock(pipe);
1527
1528        while (!pipe->nrbufs) {
1529                if (signal_pending(current)) {
1530                        ret = -ERESTARTSYS;
1531                        break;
1532                }
1533                if (!pipe->writers)
1534                        break;
1535                if (!pipe->waiting_writers) {
1536                        if (flags & SPLICE_F_NONBLOCK) {
1537                                ret = -EAGAIN;
1538                                break;
1539                        }
1540                }
1541                pipe_wait(pipe);
1542        }
1543
1544        pipe_unlock(pipe);
1545        return ret;
1546}
1547
1548/*
1549 * Make sure there's writeable room. Wait for room if we can, otherwise
1550 * return an appropriate error.
1551 */
1552static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1553{
1554        int ret;
1555
1556        /*
1557         * Check ->nrbufs without the inode lock first. This function
1558         * is speculative anyways, so missing one is ok.
1559         */
1560        if (pipe->nrbufs < PIPE_BUFFERS)
1561                return 0;
1562
1563        ret = 0;
1564        pipe_lock(pipe);
1565
1566        while (pipe->nrbufs >= PIPE_BUFFERS) {
1567                if (!pipe->readers) {
1568                        send_sig(SIGPIPE, current, 0);
1569                        ret = -EPIPE;
1570                        break;
1571                }
1572                if (flags & SPLICE_F_NONBLOCK) {
1573                        ret = -EAGAIN;
1574                        break;
1575                }
1576                if (signal_pending(current)) {
1577                        ret = -ERESTARTSYS;
1578                        break;
1579                }
1580                pipe->waiting_writers++;
1581                pipe_wait(pipe);
1582                pipe->waiting_writers--;
1583        }
1584
1585        pipe_unlock(pipe);
1586        return ret;
1587}
1588
1589/*
1590 * Link contents of ipipe to opipe.
1591 */
1592static int link_pipe(struct pipe_inode_info *ipipe,
1593                     struct pipe_inode_info *opipe,
1594                     size_t len, unsigned int flags)
1595{
1596        struct pipe_buffer *ibuf, *obuf;
1597        int ret = 0, i = 0, nbuf;
1598
1599        /*
1600         * Potential ABBA deadlock, work around it by ordering lock
1601         * grabbing by pipe info address. Otherwise two different processes
1602         * could deadlock (one doing tee from A -> B, the other from B -> A).
1603         */
1604        pipe_double_lock(ipipe, opipe);
1605
1606        do {
1607                if (!opipe->readers) {
1608                        send_sig(SIGPIPE, current, 0);
1609                        if (!ret)
1610                                ret = -EPIPE;
1611                        break;
1612                }
1613
1614                /*
1615                 * If we have iterated all input buffers or ran out of
1616                 * output room, break.
1617                 */
1618                if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
1619                        break;
1620
1621                ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1622                nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1623
1624                /*
1625                 * Get a reference to this pipe buffer,
1626                 * so we can copy the contents over.
1627                 */
1628                ibuf->ops->get(ipipe, ibuf);
1629
1630                obuf = opipe->bufs + nbuf;
1631                *obuf = *ibuf;
1632
1633                /*
1634                 * Don't inherit the gift flag, we need to
1635                 * prevent multiple steals of this page.
1636                 */
1637                obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1638
1639                if (obuf->len > len)
1640                        obuf->len = len;
1641
1642                opipe->nrbufs++;
1643                ret += obuf->len;
1644                len -= obuf->len;
1645                i++;
1646        } while (len);
1647
1648        /*
1649         * return EAGAIN if we have the potential of some data in the
1650         * future, otherwise just return 0
1651         */
1652        if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1653                ret = -EAGAIN;
1654
1655        pipe_unlock(ipipe);
1656        pipe_unlock(opipe);
1657
1658        /*
1659         * If we put data in the output pipe, wakeup any potential readers.
1660         */
1661        if (ret > 0) {
1662                smp_mb();
1663                if (waitqueue_active(&opipe->wait))
1664                        wake_up_interruptible(&opipe->wait);
1665                kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1666        }
1667
1668        return ret;
1669}
1670
1671/*
1672 * This is a tee(1) implementation that works on pipes. It doesn't copy
1673 * any data, it simply references the 'in' pages on the 'out' pipe.
1674 * The 'flags' used are the SPLICE_F_* variants, currently the only
1675 * applicable one is SPLICE_F_NONBLOCK.
1676 */
1677static long do_tee(struct file *in, struct file *out, size_t len,
1678                   unsigned int flags)
1679{
1680        struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
1681        struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
1682        int ret = -EINVAL;
1683
1684        /*
1685         * Duplicate the contents of ipipe to opipe without actually
1686         * copying the data.
1687         */
1688        if (ipipe && opipe && ipipe != opipe) {
1689                /*
1690                 * Keep going, unless we encounter an error. The ipipe/opipe
1691                 * ordering doesn't really matter.
1692                 */
1693                ret = link_ipipe_prep(ipipe, flags);
1694                if (!ret) {
1695                        ret = link_opipe_prep(opipe, flags);
1696                        if (!ret)
1697                                ret = link_pipe(ipipe, opipe, len, flags);
1698                }
1699        }
1700
1701        return ret;
1702}
1703
1704SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1705{
1706        struct file *in;
1707        int error, fput_in;
1708
1709        if (unlikely(!len))
1710                return 0;
1711
1712        error = -EBADF;
1713        in = fget_light(fdin, &fput_in);
1714        if (in) {
1715                if (in->f_mode & FMODE_READ) {
1716                        int fput_out;
1717                        struct file *out = fget_light(fdout, &fput_out);
1718
1719                        if (out) {
1720                                if (out->f_mode & FMODE_WRITE)
1721                                        error = do_tee(in, out, len, flags);
1722                                fput_light(out, fput_out);
1723                        }
1724                }
1725                fput_light(in, fput_in);
1726        }
1727
1728        return error;
1729}
1730
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.