darwin-xnu/osfmk/vm/vm_pageout.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
   3 *
   4 * @APPLE_LICENSE_HEADER_START@
   5 * 
   6 * The contents of this file constitute Original Code as defined in and
   7 * are subject to the Apple Public Source License Version 1.1 (the
   8 * "License").  You may not use this file except in compliance with the
   9 * License.  Please obtain a copy of the License at
  10 * http://www.apple.com/publicsource and read it before using this file.
  11 * 
  12 * This Original Code and all software distributed under the License are
  13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17 * License for the specific language governing rights and limitations
  18 * under the License.
  19 * 
  20 * @APPLE_LICENSE_HEADER_END@
  21 */
  22/*
  23 * @OSF_COPYRIGHT@
  24 */
  25/* 
  26 * Mach Operating System
  27 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  28 * All Rights Reserved.
  29 * 
  30 * Permission to use, copy, modify and distribute this software and its
  31 * documentation is hereby granted, provided that both the copyright
  32 * notice and this permission notice appear in all copies of the
  33 * software, derivative works or modified versions, and any portions
  34 * thereof, and that both notices appear in supporting documentation.
  35 * 
  36 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  37 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  38 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  39 * 
  40 * Carnegie Mellon requests users of this software to return to
  41 * 
  42 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  43 *  School of Computer Science
  44 *  Carnegie Mellon University
  45 *  Pittsburgh PA 15213-3890
  46 * 
  47 * any improvements or extensions that they make and grant Carnegie Mellon
  48 * the rights to redistribute these changes.
  49 */
  50/*
  51 */
  52/*
  53 *      File:   vm/vm_pageout.c
  54 *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  55 *      Date:   1985
  56 *
  57 *      The proverbial page-out daemon.
  58 */
  59
  60#include <stdint.h>
  61
  62#include <debug.h>
  63#include <mach_pagemap.h>
  64#include <mach_cluster_stats.h>
  65#include <mach_kdb.h>
  66#include <advisory_pageout.h>
  67
  68#include <mach/mach_types.h>
  69#include <mach/memory_object.h>
  70#include <mach/memory_object_default.h>
  71#include <mach/memory_object_control_server.h>
  72#include <mach/mach_host_server.h>
  73#include <mach/upl.h>
  74#include <mach/vm_map.h>
  75#include <mach/vm_param.h>
  76#include <mach/vm_statistics.h>
  77
  78#include <kern/kern_types.h>
  79#include <kern/counters.h>
  80#include <kern/host_statistics.h>
  81#include <kern/machine.h>
  82#include <kern/misc_protos.h>
  83#include <kern/thread.h>
  84#include <kern/xpr.h>
  85#include <kern/kalloc.h>
  86
  87#include <machine/vm_tuning.h>
  88
  89#include <vm/pmap.h>
  90#include <vm/vm_fault.h>
  91#include <vm/vm_map.h>
  92#include <vm/vm_object.h>
  93#include <vm/vm_page.h>
  94#include <vm/vm_pageout.h>
  95#include <vm/vm_protos.h> /* must be last */
  96
  97/*
  98 * ENCRYPTED SWAP:
  99 */
 100#ifdef __ppc__
 101#include <ppc/mappings.h>
 102#endif /* __ppc__ */
 103#include <../bsd/crypto/aes/aes.h>
 104
 105extern ipc_port_t       memory_manager_default;
 106
 107
 108#ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE
 109#define VM_PAGEOUT_BURST_ACTIVE_THROTTLE  10000  /* maximum iterations of the active queue to move pages to inactive */
 110#endif
 111
 112#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
 113#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
 114#endif
 115
 116#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
 117#define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
 118#endif
 119
 120#ifndef VM_PAGEOUT_INACTIVE_RELIEF
 121#define VM_PAGEOUT_INACTIVE_RELIEF 50   /* minimum number of pages to move to the inactive q */
 122#endif
 123
 124#ifndef VM_PAGE_LAUNDRY_MAX
 125#define VM_PAGE_LAUNDRY_MAX     16UL    /* maximum pageouts on a given pageout queue */
 126#endif  /* VM_PAGEOUT_LAUNDRY_MAX */
 127
 128#ifndef VM_PAGEOUT_BURST_WAIT
 129#define VM_PAGEOUT_BURST_WAIT   30      /* milliseconds per page */
 130#endif  /* VM_PAGEOUT_BURST_WAIT */
 131
 132#ifndef VM_PAGEOUT_EMPTY_WAIT
 133#define VM_PAGEOUT_EMPTY_WAIT   200     /* milliseconds */
 134#endif  /* VM_PAGEOUT_EMPTY_WAIT */
 135
 136#ifndef VM_PAGEOUT_DEADLOCK_WAIT
 137#define VM_PAGEOUT_DEADLOCK_WAIT        300     /* milliseconds */
 138#endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
 139
 140#ifndef VM_PAGEOUT_IDLE_WAIT
 141#define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
 142#endif  /* VM_PAGEOUT_IDLE_WAIT */
 143
 144
 145/*
 146 *      To obtain a reasonable LRU approximation, the inactive queue
 147 *      needs to be large enough to give pages on it a chance to be
 148 *      referenced a second time.  This macro defines the fraction
 149 *      of active+inactive pages that should be inactive.
 150 *      The pageout daemon uses it to update vm_page_inactive_target.
 151 *
 152 *      If vm_page_free_count falls below vm_page_free_target and
 153 *      vm_page_inactive_count is below vm_page_inactive_target,
 154 *      then the pageout daemon starts running.
 155 */
 156
 157#ifndef VM_PAGE_INACTIVE_TARGET
 158#define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 3)
 159#endif  /* VM_PAGE_INACTIVE_TARGET */
 160
 161/*
 162 *      Once the pageout daemon starts running, it keeps going
 163 *      until vm_page_free_count meets or exceeds vm_page_free_target.
 164 */
 165
 166#ifndef VM_PAGE_FREE_TARGET
 167#define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
 168#endif  /* VM_PAGE_FREE_TARGET */
 169
 170/*
 171 *      The pageout daemon always starts running once vm_page_free_count
 172 *      falls below vm_page_free_min.
 173 */
 174
 175#ifndef VM_PAGE_FREE_MIN
 176#define VM_PAGE_FREE_MIN(free)  (10 + (free) / 100)
 177#endif  /* VM_PAGE_FREE_MIN */
 178
 179/*
 180 *      When vm_page_free_count falls below vm_page_free_reserved,
 181 *      only vm-privileged threads can allocate pages.  vm-privilege
 182 *      allows the pageout daemon and default pager (and any other
 183 *      associated threads needed for default pageout) to continue
 184 *      operation by dipping into the reserved pool of pages.
 185 */
 186
 187#ifndef VM_PAGE_FREE_RESERVED
 188#define VM_PAGE_FREE_RESERVED(n)        \
 189        ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
 190#endif  /* VM_PAGE_FREE_RESERVED */
 191
 192
 193/*
 194 * must hold the page queues lock to
 195 * manipulate this structure
 196 */
 197struct vm_pageout_queue {
 198        queue_head_t    pgo_pending;    /* laundry pages to be processed by pager's iothread */
 199        unsigned int    pgo_laundry;    /* current count of laundry pages on queue or in flight */
 200        unsigned int    pgo_maxlaundry;
 201
 202        unsigned int    pgo_idle:1,     /* iothread is blocked waiting for work to do */
 203                        pgo_busy:1,     /* iothread is currently processing request from pgo_pending */
 204                        pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
 205                        :0;
 206};
 207
 208#define VM_PAGE_Q_THROTTLED(q)          \
 209        ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
 210
 211
 212/*
 213 * Exported variable used to broadcast the activation of the pageout scan
 214 * Working Set uses this to throttle its use of pmap removes.  In this
 215 * way, code which runs within memory in an uncontested context does
 216 * not keep encountering soft faults.
 217 */
 218
 219unsigned int    vm_pageout_scan_event_counter = 0;
 220
 221/*
 222 * Forward declarations for internal routines.
 223 */
 224
 225static void vm_pageout_garbage_collect(int);
 226static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
 227static void vm_pageout_iothread_external(void);
 228static void vm_pageout_iothread_internal(void);
 229static void vm_pageout_queue_steal(vm_page_t);
 230
 231extern void vm_pageout_continue(void);
 232extern void vm_pageout_scan(void);
 233
 234unsigned int vm_pageout_reserved_internal = 0;
 235unsigned int vm_pageout_reserved_really = 0;
 236
 237unsigned int vm_pageout_idle_wait = 0;          /* milliseconds */
 238unsigned int vm_pageout_empty_wait = 0;         /* milliseconds */
 239unsigned int vm_pageout_burst_wait = 0;         /* milliseconds */
 240unsigned int vm_pageout_deadlock_wait = 0;      /* milliseconds */
 241unsigned int vm_pageout_deadlock_relief = 0;
 242unsigned int vm_pageout_inactive_relief = 0;
 243unsigned int vm_pageout_burst_active_throttle = 0;
 244unsigned int vm_pageout_burst_inactive_throttle = 0;
 245
 246/*
 247 *      Protection against zero fill flushing live working sets derived
 248 *      from existing backing store and files
 249 */
 250unsigned int vm_accellerate_zf_pageout_trigger = 400;
 251unsigned int vm_zf_iterator;
 252unsigned int vm_zf_iterator_count = 40;
 253unsigned int last_page_zf;
 254unsigned int vm_zf_count = 0;
 255
 256/*
 257 *      These variables record the pageout daemon's actions:
 258 *      how many pages it looks at and what happens to those pages.
 259 *      No locking needed because only one thread modifies the variables.
 260 */
 261
 262unsigned int vm_pageout_active = 0;             /* debugging */
 263unsigned int vm_pageout_inactive = 0;           /* debugging */
 264unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
 265unsigned int vm_pageout_inactive_forced = 0;    /* debugging */
 266unsigned int vm_pageout_inactive_nolock = 0;    /* debugging */
 267unsigned int vm_pageout_inactive_avoid = 0;     /* debugging */
 268unsigned int vm_pageout_inactive_busy = 0;      /* debugging */
 269unsigned int vm_pageout_inactive_absent = 0;    /* debugging */
 270unsigned int vm_pageout_inactive_used = 0;      /* debugging */
 271unsigned int vm_pageout_inactive_clean = 0;     /* debugging */
 272unsigned int vm_pageout_inactive_dirty = 0;     /* debugging */
 273unsigned int vm_pageout_dirty_no_pager = 0;     /* debugging */
 274unsigned int vm_pageout_purged_objects = 0;     /* debugging */
 275unsigned int vm_stat_discard = 0;               /* debugging */
 276unsigned int vm_stat_discard_sent = 0;          /* debugging */
 277unsigned int vm_stat_discard_failure = 0;       /* debugging */
 278unsigned int vm_stat_discard_throttle = 0;      /* debugging */
 279
 280unsigned int vm_pageout_scan_active_throttled = 0;
 281unsigned int vm_pageout_scan_inactive_throttled = 0;
 282unsigned int vm_pageout_scan_throttle = 0;                      /* debugging */
 283unsigned int vm_pageout_scan_burst_throttle = 0;                /* debugging */
 284unsigned int vm_pageout_scan_empty_throttle = 0;                /* debugging */
 285unsigned int vm_pageout_scan_deadlock_detected = 0;             /* debugging */
 286unsigned int vm_pageout_scan_active_throttle_success = 0;       /* debugging */
 287unsigned int vm_pageout_scan_inactive_throttle_success = 0;     /* debugging */
 288/*
 289 * Backing store throttle when BS is exhausted
 290 */
 291unsigned int    vm_backing_store_low = 0;
 292
 293unsigned int vm_pageout_out_of_line  = 0;
 294unsigned int vm_pageout_in_place  = 0;
 295
 296/*
 297 * ENCRYPTED SWAP:
 298 * counters and statistics...
 299 */
 300unsigned long vm_page_decrypt_counter = 0;
 301unsigned long vm_page_decrypt_for_upl_counter = 0;
 302unsigned long vm_page_encrypt_counter = 0;
 303unsigned long vm_page_encrypt_abort_counter = 0;
 304unsigned long vm_page_encrypt_already_encrypted_counter = 0;
 305boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
 306
 307
 308struct  vm_pageout_queue vm_pageout_queue_internal;
 309struct  vm_pageout_queue vm_pageout_queue_external;
 310
 311
 312/*
 313 *      Routine:        vm_backing_store_disable
 314 *      Purpose:
 315 *              Suspend non-privileged threads wishing to extend
 316 *              backing store when we are low on backing store
 317 *              (Synchronized by caller)
 318 */
 319void
 320vm_backing_store_disable(
 321        boolean_t       disable)
 322{
 323        if(disable) {
 324                vm_backing_store_low = 1;
 325        } else {
 326                if(vm_backing_store_low) {
 327                        vm_backing_store_low = 0;
 328                        thread_wakeup((event_t) &vm_backing_store_low);
 329                }
 330        }
 331}
 332
 333
 334/*
 335 *      Routine:        vm_pageout_object_allocate
 336 *      Purpose:
 337 *              Allocate an object for use as out-of-line memory in a
 338 *              data_return/data_initialize message.
 339 *              The page must be in an unlocked object.
 340 *
 341 *              If the page belongs to a trusted pager, cleaning in place
 342 *              will be used, which utilizes a special "pageout object"
 343 *              containing private alias pages for the real page frames.
 344 *              Untrusted pagers use normal out-of-line memory.
 345 */
 346vm_object_t
 347vm_pageout_object_allocate(
 348        vm_page_t               m,
 349        vm_size_t               size,
 350        vm_object_offset_t      offset)
 351{
 352        vm_object_t     object = m->object;
 353        vm_object_t     new_object;
 354
 355        assert(object->pager_ready);
 356
 357        new_object = vm_object_allocate(size);
 358
 359        if (object->pager_trusted) {
 360                assert (offset < object->size);
 361
 362                vm_object_lock(new_object);
 363                new_object->pageout = TRUE;
 364                new_object->shadow = object;
 365                new_object->can_persist = FALSE;
 366                new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
 367                new_object->shadow_offset = offset;
 368                vm_object_unlock(new_object);
 369
 370                /*
 371                 * Take a paging reference on the object. This will be dropped
 372                 * in vm_pageout_object_terminate()
 373                 */
 374                vm_object_lock(object);
 375                vm_object_paging_begin(object);
 376                vm_page_lock_queues();
 377                vm_page_unlock_queues();
 378                vm_object_unlock(object);
 379
 380                vm_pageout_in_place++;
 381        } else
 382                vm_pageout_out_of_line++;
 383        return(new_object);
 384}
 385
 386#if MACH_CLUSTER_STATS
 387unsigned long vm_pageout_cluster_dirtied = 0;
 388unsigned long vm_pageout_cluster_cleaned = 0;
 389unsigned long vm_pageout_cluster_collisions = 0;
 390unsigned long vm_pageout_cluster_clusters = 0;
 391unsigned long vm_pageout_cluster_conversions = 0;
 392unsigned long vm_pageout_target_collisions = 0;
 393unsigned long vm_pageout_target_page_dirtied = 0;
 394unsigned long vm_pageout_target_page_freed = 0;
 395#define CLUSTER_STAT(clause)    clause
 396#else   /* MACH_CLUSTER_STATS */
 397#define CLUSTER_STAT(clause)
 398#endif  /* MACH_CLUSTER_STATS */
 399
 400/* 
 401 *      Routine:        vm_pageout_object_terminate
 402 *      Purpose:
 403 *              Destroy the pageout_object allocated by
 404 *              vm_pageout_object_allocate(), and perform all of the
 405 *              required cleanup actions.
 406 * 
 407 *      In/Out conditions:
 408 *              The object must be locked, and will be returned locked.
 409 */
 410void
 411vm_pageout_object_terminate(
 412        vm_object_t     object)
 413{
 414        vm_object_t     shadow_object;
 415        boolean_t       shadow_internal;
 416
 417        /*
 418         * Deal with the deallocation (last reference) of a pageout object
 419         * (used for cleaning-in-place) by dropping the paging references/
 420         * freeing pages in the original object.
 421         */
 422
 423        assert(object->pageout);
 424        shadow_object = object->shadow;
 425        vm_object_lock(shadow_object);
 426        shadow_internal = shadow_object->internal;
 427
 428        while (!queue_empty(&object->memq)) {
 429                vm_page_t               p, m;
 430                vm_object_offset_t      offset;
 431
 432                p = (vm_page_t) queue_first(&object->memq);
 433
 434                assert(p->private);
 435                assert(p->pageout);
 436                p->pageout = FALSE;
 437                assert(!p->cleaning);
 438
 439                offset = p->offset;
 440                VM_PAGE_FREE(p);
 441                p = VM_PAGE_NULL;
 442
 443                m = vm_page_lookup(shadow_object,
 444                        offset + object->shadow_offset);
 445
 446                if(m == VM_PAGE_NULL)
 447                        continue;
 448                assert(m->cleaning);
 449                /* used as a trigger on upl_commit etc to recognize the */
 450                /* pageout daemon's subseqent desire to pageout a cleaning */
 451                /* page.  When the bit is on the upl commit code will   */
 452                /* respect the pageout bit in the target page over the  */
 453                /* caller's page list indication */
 454                m->dump_cleaning = FALSE;
 455
 456                /*
 457                 * Account for the paging reference taken when
 458                 * m->cleaning was set on this page.
 459                 */
 460                vm_object_paging_end(shadow_object);
 461                assert((m->dirty) || (m->precious) ||
 462                                (m->busy && m->cleaning));
 463
 464                /*
 465                 * Handle the trusted pager throttle.
 466                 * Also decrement the burst throttle (if external).
 467                 */
 468                vm_page_lock_queues();
 469                if (m->laundry) {
 470                        vm_pageout_throttle_up(m);
 471                }
 472
 473                /*
 474                 * Handle the "target" page(s). These pages are to be freed if
 475                 * successfully cleaned. Target pages are always busy, and are
 476                 * wired exactly once. The initial target pages are not mapped,
 477                 * (so cannot be referenced or modified) but converted target
 478                 * pages may have been modified between the selection as an
 479                 * adjacent page and conversion to a target.
 480                 */
 481                if (m->pageout) {
 482                        assert(m->busy);
 483                        assert(m->wire_count == 1);
 484                        m->cleaning = FALSE;
 485                        m->pageout = FALSE;
 486#if MACH_CLUSTER_STATS
 487                        if (m->wanted) vm_pageout_target_collisions++;
 488#endif
 489                        /*
 490                         * Revoke all access to the page. Since the object is
 491                         * locked, and the page is busy, this prevents the page
 492                         * from being dirtied after the pmap_disconnect() call
 493                         * returns.
 494                         *
 495                         * Since the page is left "dirty" but "not modifed", we
 496                         * can detect whether the page was redirtied during
 497                         * pageout by checking the modify state.
 498                         */
 499                        if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
 500                              m->dirty = TRUE;
 501                        else
 502                              m->dirty = FALSE;
 503
 504                        if (m->dirty) {
 505                                CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
 506                                vm_page_unwire(m);/* reactivates */
 507                                VM_STAT(reactivations++);
 508                                PAGE_WAKEUP_DONE(m);
 509                        } else {
 510                                CLUSTER_STAT(vm_pageout_target_page_freed++;)
 511                                vm_page_free(m);/* clears busy, etc. */
 512                        }
 513                        vm_page_unlock_queues();
 514                        continue;
 515                }
 516                /*
 517                 * Handle the "adjacent" pages. These pages were cleaned in
 518                 * place, and should be left alone.
 519                 * If prep_pin_count is nonzero, then someone is using the
 520                 * page, so make it active.
 521                 */
 522                if (!m->active && !m->inactive && !m->private) {
 523                        if (m->reference)
 524                                vm_page_activate(m);
 525                        else
 526                                vm_page_deactivate(m);
 527                }
 528                if((m->busy) && (m->cleaning)) {
 529
 530                        /* the request_page_list case, (COPY_OUT_FROM FALSE) */
 531                        m->busy = FALSE;
 532
 533                        /* We do not re-set m->dirty ! */
 534                        /* The page was busy so no extraneous activity     */
 535                        /* could have occurred. COPY_INTO is a read into the */
 536                        /* new pages. CLEAN_IN_PLACE does actually write   */
 537                        /* out the pages but handling outside of this code */
 538                        /* will take care of resetting dirty. We clear the */
 539                        /* modify however for the Programmed I/O case.     */ 
 540                        pmap_clear_modify(m->phys_page);
 541                        if(m->absent) {
 542                                m->absent = FALSE;
 543                                if(shadow_object->absent_count == 1)
 544                                        vm_object_absent_release(shadow_object);
 545                                else
 546                                        shadow_object->absent_count--;
 547                        }
 548                        m->overwriting = FALSE;
 549                } else if (m->overwriting) {
 550                        /* alternate request page list, write to page_list */
 551                        /* case.  Occurs when the original page was wired  */
 552                        /* at the time of the list request */
 553                        assert(m->wire_count != 0);
 554                        vm_page_unwire(m);/* reactivates */
 555                        m->overwriting = FALSE;
 556                } else {
 557                /*
 558                 * Set the dirty state according to whether or not the page was
 559                 * modified during the pageout. Note that we purposefully do
 560                 * NOT call pmap_clear_modify since the page is still mapped.
 561                 * If the page were to be dirtied between the 2 calls, this
 562                 * this fact would be lost. This code is only necessary to
 563                 * maintain statistics, since the pmap module is always
 564                 * consulted if m->dirty is false.
 565                 */
 566#if MACH_CLUSTER_STATS
 567                        m->dirty = pmap_is_modified(m->phys_page);
 568
 569                        if (m->dirty)   vm_pageout_cluster_dirtied++;
 570                        else            vm_pageout_cluster_cleaned++;
 571                        if (m->wanted)  vm_pageout_cluster_collisions++;
 572#else
 573                        m->dirty = 0;
 574#endif
 575                }
 576                m->cleaning = FALSE;
 577
 578                /*
 579                 * Wakeup any thread waiting for the page to be un-cleaning.
 580                 */
 581                PAGE_WAKEUP(m);
 582                vm_page_unlock_queues();
 583        }
 584        /*
 585         * Account for the paging reference taken in vm_paging_object_allocate.
 586         */
 587        vm_object_paging_end(shadow_object);
 588        vm_object_unlock(shadow_object);
 589
 590        assert(object->ref_count == 0);
 591        assert(object->paging_in_progress == 0);
 592        assert(object->resident_page_count == 0);
 593        return;
 594}
 595
 596/*
 597 *      Routine:        vm_pageout_setup
 598 *      Purpose:
 599 *              Set up a page for pageout (clean & flush).
 600 *
 601 *              Move the page to a new object, as part of which it will be
 602 *              sent to its memory manager in a memory_object_data_write or
 603 *              memory_object_initialize message.
 604 *
 605 *              The "new_object" and "new_offset" arguments
 606 *              indicate where the page should be moved.
 607 *
 608 *      In/Out conditions:
 609 *              The page in question must not be on any pageout queues,
 610 *              and must be busy.  The object to which it belongs
 611 *              must be unlocked, and the caller must hold a paging
 612 *              reference to it.  The new_object must not be locked.
 613 *
 614 *              This routine returns a pointer to a place-holder page,
 615 *              inserted at the same offset, to block out-of-order
 616 *              requests for the page.  The place-holder page must
 617 *              be freed after the data_write or initialize message
 618 *              has been sent.
 619 *
 620 *              The original page is put on a paging queue and marked
 621 *              not busy on exit.
 622 */
 623vm_page_t
 624vm_pageout_setup(
 625        register vm_page_t      m,
 626        register vm_object_t    new_object,
 627        vm_object_offset_t      new_offset)
 628{
 629        register vm_object_t    old_object = m->object;
 630        vm_object_offset_t      paging_offset;
 631        vm_object_offset_t      offset;
 632        register vm_page_t      holding_page;
 633        register vm_page_t      new_m;
 634        boolean_t               need_to_wire = FALSE;
 635
 636
 637        XPR(XPR_VM_PAGEOUT,
 638     "vm_pageout_setup, obj 0x%X off 0x%X page 0x%X new obj 0x%X offset 0x%X\n",
 639                (integer_t)m->object, (integer_t)m->offset, 
 640                (integer_t)m, (integer_t)new_object, 
 641                (integer_t)new_offset);
 642        assert(m && m->busy && !m->absent && !m->fictitious && !m->error &&
 643                !m->restart);
 644
 645        assert(m->dirty || m->precious);
 646
 647        /*
 648         *      Create a place-holder page where the old one was, to prevent
 649         *      attempted pageins of this page while we're unlocked.
 650         */
 651        VM_PAGE_GRAB_FICTITIOUS(holding_page);
 652
 653        vm_object_lock(old_object);
 654
 655        offset = m->offset;
 656        paging_offset = offset + old_object->paging_offset;
 657
 658        if (old_object->pager_trusted) {
 659                /*
 660                 * This pager is trusted, so we can clean this page
 661                 * in place. Leave it in the old object, and mark it
 662                 * cleaning & pageout.
 663                 */
 664                new_m = holding_page;
 665                holding_page = VM_PAGE_NULL;
 666
 667                /*
 668                 * Set up new page to be private shadow of real page.
 669                 */
 670                new_m->phys_page = m->phys_page;
 671                new_m->fictitious = FALSE;
 672                new_m->pageout = TRUE;
 673
 674                /*
 675                 * Mark real page as cleaning (indicating that we hold a
 676                 * paging reference to be released via m_o_d_r_c) and
 677                 * pageout (indicating that the page should be freed
 678                 * when the pageout completes).
 679                 */
 680                pmap_clear_modify(m->phys_page);
 681                vm_page_lock_queues();
 682                new_m->private = TRUE;
 683                vm_page_wire(new_m);
 684                m->cleaning = TRUE;
 685                m->pageout = TRUE;
 686
 687                vm_page_wire(m);
 688                assert(m->wire_count == 1);
 689                vm_page_unlock_queues();
 690
 691                m->dirty = TRUE;
 692                m->precious = FALSE;
 693                m->page_lock = VM_PROT_NONE;
 694                m->unusual = FALSE;
 695                m->unlock_request = VM_PROT_NONE;
 696        } else {
 697                /*
 698                 * Cannot clean in place, so rip the old page out of the
 699                 * object, and stick the holding page in. Set new_m to the
 700                 * page in the new object.
 701                 */
 702                vm_page_lock_queues();
 703                VM_PAGE_QUEUES_REMOVE(m);
 704                vm_page_remove(m);
 705
 706                vm_page_insert(holding_page, old_object, offset);
 707                vm_page_unlock_queues();
 708
 709                m->dirty = TRUE;
 710                m->precious = FALSE;
 711                new_m = m;
 712                new_m->page_lock = VM_PROT_NONE;
 713                new_m->unlock_request = VM_PROT_NONE;
 714
 715                if (old_object->internal)
 716                        need_to_wire = TRUE;
 717        }
 718        /*
 719         *      Record that this page has been written out
 720         */
 721#if     MACH_PAGEMAP
 722        vm_external_state_set(old_object->existence_map, offset);
 723#endif  /* MACH_PAGEMAP */
 724
 725        vm_object_unlock(old_object);
 726
 727        vm_object_lock(new_object);
 728
 729        /*
 730         *      Put the page into the new object. If it is a not wired
 731         *      (if it's the real page) it will be activated.
 732         */
 733
 734        vm_page_lock_queues();
 735        vm_page_insert(new_m, new_object, new_offset);
 736        if (need_to_wire)
 737                vm_page_wire(new_m);
 738        else
 739                vm_page_activate(new_m);
 740        PAGE_WAKEUP_DONE(new_m);
 741        vm_page_unlock_queues();
 742
 743        vm_object_unlock(new_object);
 744
 745        /*
 746         *      Return the placeholder page to simplify cleanup.
 747         */
 748        return (holding_page);
 749}
 750
 751/*
 752 * Routine:     vm_pageclean_setup
 753 *
 754 * Purpose:     setup a page to be cleaned (made non-dirty), but not
 755 *              necessarily flushed from the VM page cache.
 756 *              This is accomplished by cleaning in place.
 757 *
 758 *              The page must not be busy, and the object and page
 759 *              queues must be locked.
 760 *              
 761 */
 762void
 763vm_pageclean_setup(
 764        vm_page_t               m,
 765        vm_page_t               new_m,
 766        vm_object_t             new_object,
 767        vm_object_offset_t      new_offset)
 768{
 769        vm_object_t old_object = m->object;
 770        assert(!m->busy);
 771        assert(!m->cleaning);
 772
 773        XPR(XPR_VM_PAGEOUT,
 774    "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
 775                (integer_t)old_object, m->offset, (integer_t)m, 
 776                (integer_t)new_m, new_offset);
 777
 778        pmap_clear_modify(m->phys_page);
 779        vm_object_paging_begin(old_object);
 780
 781        /*
 782         *      Record that this page has been written out
 783         */
 784#if     MACH_PAGEMAP
 785        vm_external_state_set(old_object->existence_map, m->offset);
 786#endif  /*MACH_PAGEMAP*/
 787
 788        /*
 789         * Mark original page as cleaning in place.
 790         */
 791        m->cleaning = TRUE;
 792        m->dirty = TRUE;
 793        m->precious = FALSE;
 794
 795        /*
 796         * Convert the fictitious page to a private shadow of
 797         * the real page.
 798         */
 799        assert(new_m->fictitious);
 800        new_m->fictitious = FALSE;
 801        new_m->private = TRUE;
 802        new_m->pageout = TRUE;
 803        new_m->phys_page = m->phys_page;
 804        vm_page_wire(new_m);
 805
 806        vm_page_insert(new_m, new_object, new_offset);
 807        assert(!new_m->wanted);
 808        new_m->busy = FALSE;
 809}
 810
 811void
 812vm_pageclean_copy(
 813        vm_page_t               m,
 814        vm_page_t               new_m,
 815        vm_object_t             new_object,
 816        vm_object_offset_t      new_offset)
 817{
 818        XPR(XPR_VM_PAGEOUT,
 819        "vm_pageclean_copy, page 0x%X new_m 0x%X new_obj 0x%X offset 0x%X\n",
 820                m, new_m, new_object, new_offset, 0);
 821
 822        assert((!m->busy) && (!m->cleaning));
 823
 824        assert(!new_m->private && !new_m->fictitious);
 825
 826        pmap_clear_modify(m->phys_page);
 827
 828        m->busy = TRUE;
 829        vm_object_paging_begin(m->object);
 830        vm_page_unlock_queues();
 831        vm_object_unlock(m->object);
 832
 833        /*
 834         * Copy the original page to the new page.
 835         */
 836        vm_page_copy(m, new_m);
 837
 838        /*
 839         * Mark the old page as clean. A request to pmap_is_modified
 840         * will get the right answer.
 841         */
 842        vm_object_lock(m->object);
 843        m->dirty = FALSE;
 844
 845        vm_object_paging_end(m->object);
 846
 847        vm_page_lock_queues();
 848        if (!m->active && !m->inactive)
 849                vm_page_activate(m);
 850        PAGE_WAKEUP_DONE(m);
 851
 852        vm_page_insert(new_m, new_object, new_offset);
 853        vm_page_activate(new_m);
 854        new_m->busy = FALSE;    /* No other thread can be waiting */
 855}
 856
 857
 858/*
 859 *      Routine:        vm_pageout_initialize_page
 860 *      Purpose:
 861 *              Causes the specified page to be initialized in
 862 *              the appropriate memory object. This routine is used to push
 863 *              pages into a copy-object when they are modified in the
 864 *              permanent object.
 865 *
 866 *              The page is moved to a temporary object and paged out.
 867 *
 868 *      In/out conditions:
 869 *              The page in question must not be on any pageout queues.
 870 *              The object to which it belongs must be locked.
 871 *              The page must be busy, but not hold a paging reference.
 872 *
 873 *      Implementation:
 874 *              Move this page to a completely new object.
 875 */
 876void    
 877vm_pageout_initialize_page(
 878        vm_page_t       m)
 879{
 880        vm_object_t             object;
 881        vm_object_offset_t      paging_offset;
 882        vm_page_t               holding_page;
 883
 884
 885        XPR(XPR_VM_PAGEOUT,
 886                "vm_pageout_initialize_page, page 0x%X\n",
 887                (integer_t)m, 0, 0, 0, 0);
 888        assert(m->busy);
 889
 890        /*
 891         *      Verify that we really want to clean this page
 892         */
 893        assert(!m->absent);
 894        assert(!m->error);
 895        assert(m->dirty);
 896
 897        /*
 898         *      Create a paging reference to let us play with the object.
 899         */
 900        object = m->object;
 901        paging_offset = m->offset + object->paging_offset;
 902        vm_object_paging_begin(object);
 903        if (m->absent || m->error || m->restart ||
 904            (!m->dirty && !m->precious)) {
 905                VM_PAGE_FREE(m);
 906                panic("reservation without pageout?"); /* alan */
 907             vm_object_unlock(object);
 908                return;
 909        }
 910
 911        /* set the page for future call to vm_fault_list_request */
 912        holding_page = NULL;
 913        vm_page_lock_queues();
 914        pmap_clear_modify(m->phys_page);
 915        m->dirty = TRUE;
 916        m->busy = TRUE;
 917        m->list_req_pending = TRUE;
 918        m->cleaning = TRUE;
 919        m->pageout = TRUE;
 920        vm_page_wire(m);
 921        vm_page_unlock_queues();
 922        vm_object_unlock(object);
 923
 924        /*
 925         *      Write the data to its pager.
 926         *      Note that the data is passed by naming the new object,
 927         *      not a virtual address; the pager interface has been
 928         *      manipulated to use the "internal memory" data type.
 929         *      [The object reference from its allocation is donated
 930         *      to the eventual recipient.]
 931         */
 932        memory_object_data_initialize(object->pager,
 933                                        paging_offset,
 934                                        PAGE_SIZE);
 935
 936        vm_object_lock(object);
 937}
 938
 939#if     MACH_CLUSTER_STATS
 940#define MAXCLUSTERPAGES 16
 941struct {
 942        unsigned long pages_in_cluster;
 943        unsigned long pages_at_higher_offsets;
 944        unsigned long pages_at_lower_offsets;
 945} cluster_stats[MAXCLUSTERPAGES];
 946#endif  /* MACH_CLUSTER_STATS */
 947
 948boolean_t allow_clustered_pageouts = FALSE;
 949
 950/*
 951 * vm_pageout_cluster:
 952 *
 953 * Given a page, queue it to the appropriate I/O thread,
 954 * which will page it out and attempt to clean adjacent pages
 955 * in the same operation.
 956 *
 957 * The page must be busy, and the object and queues locked. We will take a
 958 * paging reference to prevent deallocation or collapse when we
 959 * release the object lock back at the call site.  The I/O thread
 960 * is responsible for consuming this reference
 961 *
 962 * The page must not be on any pageout queue.
 963 */
 964
 965void
 966vm_pageout_cluster(vm_page_t m)
 967{
 968        vm_object_t     object = m->object;
 969        struct          vm_pageout_queue *q;
 970
 971
 972        XPR(XPR_VM_PAGEOUT,
 973                "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
 974                (integer_t)object, m->offset, (integer_t)m, 0, 0);
 975
 976        /*
 977         * Only a certain kind of page is appreciated here.
 978         */
 979        assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
 980        assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
 981
 982        /*
 983         * protect the object from collapse - 
 984         * locking in the object's paging_offset.
 985         */
 986        vm_object_paging_begin(object);
 987
 988        /*
 989         * set the page for future call to vm_fault_list_request
 990         * page should already be marked busy
 991         */
 992        vm_page_wire(m);
 993        m->list_req_pending = TRUE;
 994        m->cleaning = TRUE;
 995        m->pageout = TRUE;
 996        m->laundry = TRUE;
 997
 998        if (object->internal == TRUE)
 999                q = &vm_pageout_queue_internal;
1000        else
1001                q = &vm_pageout_queue_external;
1002        q->pgo_laundry++;
1003
1004        m->pageout_queue = TRUE;
1005        queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
1006        
1007        if (q->pgo_idle == TRUE) {
1008                q->pgo_idle = FALSE;
1009                thread_wakeup((event_t) &q->pgo_pending);
1010        }
1011}
1012
1013
1014unsigned long vm_pageout_throttle_up_count = 0;
1015
1016/*
1017 * A page is back from laundry.  See if there are some pages waiting to
1018 * go to laundry and if we can let some of them go now.
1019 *
1020 * Object and page queues must be locked.
1021 */
1022void
1023vm_pageout_throttle_up(
1024        vm_page_t       m)
1025{
1026        struct vm_pageout_queue *q;
1027
1028        vm_pageout_throttle_up_count++;
1029
1030        assert(m->laundry);
1031        assert(m->object != VM_OBJECT_NULL);
1032        assert(m->object != kernel_object);
1033
1034        if (m->object->internal == TRUE)
1035                q = &vm_pageout_queue_internal;
1036        else
1037                q = &vm_pageout_queue_external;
1038
1039        m->laundry = FALSE;
1040        q->pgo_laundry--;
1041
1042        if (q->pgo_throttled == TRUE) {
1043                q->pgo_throttled = FALSE;
1044                thread_wakeup((event_t) &q->pgo_laundry);
1045        }
1046}
1047
1048
1049/*
1050 *      vm_pageout_scan does the dirty work for the pageout daemon.
1051 *      It returns with vm_page_queue_free_lock held and
1052 *      vm_page_free_wanted == 0.
1053 */
1054
1055#define DELAYED_UNLOCK_LIMIT  (3 * MAX_UPL_TRANSFER)
1056
1057#define FCS_IDLE                0
1058#define FCS_DELAYED             1
1059#define FCS_DEADLOCK_DETECTED   2
1060
1061struct flow_control {
1062        int             state;
1063        mach_timespec_t ts;
1064};
1065
1066extern kern_return_t    sysclk_gettime(mach_timespec_t *);
1067
1068
1069void
1070vm_pageout_scan(void)
1071{
1072        unsigned int loop_count = 0;
1073        unsigned int inactive_burst_count = 0;
1074        unsigned int active_burst_count = 0;
1075        vm_page_t   local_freeq = 0;
1076        int         local_freed = 0;
1077        int         delayed_unlock = 0;
1078        int         need_internal_inactive = 0;
1079        int         refmod_state = 0;
1080        int     vm_pageout_deadlock_target = 0;
1081        struct  vm_pageout_queue *iq;
1082        struct  vm_pageout_queue *eq;
1083        struct  flow_control    flow_control;
1084        boolean_t active_throttled = FALSE;
1085        boolean_t inactive_throttled = FALSE;
1086        mach_timespec_t         ts;
1087        unsigned int msecs = 0;
1088        vm_object_t     object;
1089        
1090
1091        flow_control.state = FCS_IDLE;
1092        iq = &vm_pageout_queue_internal;
1093        eq = &vm_pageout_queue_external;
1094
1095        XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1096
1097/*???*/ /*
1098         *      We want to gradually dribble pages from the active queue
1099         *      to the inactive queue.  If we let the inactive queue get
1100         *      very small, and then suddenly dump many pages into it,
1101         *      those pages won't get a sufficient chance to be referenced
1102         *      before we start taking them from the inactive queue.
1103         *
1104         *      We must limit the rate at which we send pages to the pagers.
1105         *      data_write messages consume memory, for message buffers and
1106         *      for map-copy objects.  If we get too far ahead of the pagers,
1107         *      we can potentially run out of memory.
1108         *
1109         *      We can use the laundry count to limit directly the number
1110         *      of pages outstanding to the default pager.  A similar
1111         *      strategy for external pagers doesn't work, because
1112         *      external pagers don't have to deallocate the pages sent them,
1113         *      and because we might have to send pages to external pagers
1114         *      even if they aren't processing writes.  So we also
1115         *      use a burst count to limit writes to external pagers.
1116         *
1117         *      When memory is very tight, we can't rely on external pagers to
1118         *      clean pages.  They probably aren't running, because they
1119         *      aren't vm-privileged.  If we kept sending dirty pages to them,
1120         *      we could exhaust the free list.
1121         */
1122        vm_page_lock_queues();
1123        delayed_unlock = 1;
1124
1125
1126Restart:
1127        /*
1128         *      Recalculate vm_page_inactive_target.
1129         */
1130        vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1131                                                          vm_page_inactive_count);
1132        object = NULL;
1133
1134        for (;;) {
1135                vm_page_t m;
1136
1137                if (delayed_unlock == 0)
1138                        vm_page_lock_queues();
1139
1140                active_burst_count = vm_page_active_count;
1141
1142                if (active_burst_count > vm_pageout_burst_active_throttle)
1143                        active_burst_count = vm_pageout_burst_active_throttle;
1144
1145                /*
1146                 *      Move pages from active to inactive.
1147                 */
1148                while ((need_internal_inactive ||
1149                           vm_page_inactive_count < vm_page_inactive_target) &&
1150                       !queue_empty(&vm_page_queue_active) &&
1151                       ((active_burst_count--) > 0)) {
1152
1153                        vm_pageout_active++;
1154
1155                        m = (vm_page_t) queue_first(&vm_page_queue_active);
1156
1157                        assert(m->active && !m->inactive);
1158                        assert(!m->laundry);
1159                        assert(m->object != kernel_object);
1160
1161                        /*
1162                         * Try to lock object; since we've already got the
1163                         * page queues lock, we can only 'try' for this one.
1164                         * if the 'try' fails, we need to do a mutex_pause
1165                         * to allow the owner of the object lock a chance to
1166                         * run... otherwise, we're likely to trip over this
1167                         * object in the same state as we work our way through
1168                         * the queue... clumps of pages associated with the same
1169                         * object are fairly typical on the inactive and active queues
1170                         */
1171                        if (m->object != object) {
1172                                if (object != NULL) {
1173                                        vm_object_unlock(object);
1174                                        object = NULL;
1175                                }
1176                                if (!vm_object_lock_try(m->object)) {
1177                                        /*
1178                                         * move page to end of active queue and continue
1179                                         */
1180                                        queue_remove(&vm_page_queue_active, m,
1181                                                     vm_page_t, pageq);
1182                                        queue_enter(&vm_page_queue_active, m,
1183                                                    vm_page_t, pageq);
1184                                        
1185                                        goto done_with_activepage;
1186                                }
1187                                object = m->object;
1188                        }
1189                        /*
1190                         * if the page is BUSY, then we pull it
1191                         * off the active queue and leave it alone.
1192                         * when BUSY is cleared, it will get stuck
1193                         * back on the appropriate queue
1194                         */
1195                        if (m->busy) {
1196                                queue_remove(&vm_page_queue_active, m,
1197                                             vm_page_t, pageq);
1198                                m->pageq.next = NULL;
1199                                m->pageq.prev = NULL;
1200
1201                                if (!m->fictitious)
1202                                        vm_page_active_count--;
1203                                m->active = FALSE;
1204
1205                                goto done_with_activepage;
1206                        }
1207                        if (need_internal_inactive) {
1208                                /*
1209                                 * If we're unable to make forward progress
1210                                 * with the current set of pages on the 
1211                                 * inactive queue due to busy objects or
1212                                 * throttled pageout queues, then 
1213                                 * move a page that is already clean
1214                                 * or belongs to a pageout queue that
1215                                 * isn't currently throttled
1216                                 */
1217                                active_throttled = FALSE;
1218
1219                                if (object->internal) {
1220                                        if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1221                                                active_throttled = TRUE;
1222                                } else if (VM_PAGE_Q_THROTTLED(eq)) {
1223                                                active_throttled = TRUE;
1224                                }
1225                                if (active_throttled == TRUE) {
1226                                        if (!m->dirty) {
1227                                                refmod_state = pmap_get_refmod(m->phys_page);
1228                  
1229                                                if (refmod_state & VM_MEM_REFERENCED)
1230                                                        m->reference = TRUE;
1231                                                if (refmod_state & VM_MEM_MODIFIED)
1232                                                        m->dirty = TRUE;
1233                                        }
1234                                        if (m->dirty || m->precious) {
1235                                                /*
1236                                                 * page is dirty and targets a THROTTLED queue
1237                                                 * so all we can do is move it back to the
1238                                                 * end of the active queue to get it out
1239                                                 * of the way
1240                                                 */
1241                                                queue_remove(&vm_page_queue_active, m,
1242                                                             vm_page_t, pageq);
1243                                                queue_enter(&vm_page_queue_active, m,
1244                                                            vm_page_t, pageq);
1245
1246                                                vm_pageout_scan_active_throttled++;
1247
1248                                                goto done_with_activepage;
1249                                        }
1250                                }
1251                                vm_pageout_scan_active_throttle_success++;
1252                                need_internal_inactive--;
1253                        }
1254                        /*
1255                         *      Deactivate the page while holding the object
1256                         *      locked, so we know the page is still not busy.
1257                         *      This should prevent races between pmap_enter
1258                         *      and pmap_clear_reference.  The page might be
1259                         *      absent or fictitious, but vm_page_deactivate
1260                         *      can handle that.
1261                         */
1262                        vm_page_deactivate(m);
1263done_with_activepage:
1264                        if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1265
1266                                if (object != NULL) {
1267                                        vm_object_unlock(object);
1268                                        object = NULL;
1269                                }
1270                                if (local_freeq) {
1271                                        vm_page_free_list(local_freeq);
1272                                        
1273                                        local_freeq = 0;
1274                                        local_freed = 0;
1275                                }
1276                                delayed_unlock = 0;
1277                                vm_page_unlock_queues();
1278
1279                                mutex_pause();
1280                                vm_page_lock_queues();
1281                                /*
1282                                 * continue the while loop processing
1283                                 * the active queue... need to hold
1284                                 * the page queues lock
1285                                 */
1286                                continue;
1287                        }
1288                }
1289
1290
1291
1292                /**********************************************************************
1293                 * above this point we're playing with the active queue
1294                 * below this point we're playing with the throttling mechanisms
1295                 * and the inactive queue
1296                 **********************************************************************/
1297
1298
1299
1300                /*
1301                 *      We are done if we have met our target *and*
1302                 *      nobody is still waiting for a page.
1303                 */
1304                if (vm_page_free_count + local_freed >= vm_page_free_target) {
1305                        if (object != NULL) {
1306                                vm_object_unlock(object);
1307                                object = NULL;
1308                        }
1309                        if (local_freeq) {
1310                                vm_page_free_list(local_freeq);
1311                                        
1312                                local_freeq = 0;
1313                                local_freed = 0;
1314                        }
1315                        mutex_lock(&vm_page_queue_free_lock);
1316
1317                        if ((vm_page_free_count >= vm_page_free_target) &&
1318                                  (vm_page_free_wanted == 0)) {
1319
1320                                vm_page_unlock_queues();
1321
1322                                thread_wakeup((event_t) &vm_pageout_garbage_collect);
1323                                return;
1324                        }
1325                        mutex_unlock(&vm_page_queue_free_lock);
1326                }
1327
1328
1329                /*
1330                 * Sometimes we have to pause:
1331                 *      1) No inactive pages - nothing to do.
1332                 *      2) Flow control - default pageout queue is full
1333                 *      3) Loop control - no acceptable pages found on the inactive queue
1334                 *         within the last vm_pageout_burst_inactive_throttle iterations
1335                 */
1336                if ((queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf))) {
1337                        vm_pageout_scan_empty_throttle++;
1338                        msecs = vm_pageout_empty_wait;
1339                        goto vm_pageout_scan_delay;
1340
1341                } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1342                        vm_pageout_scan_burst_throttle++;
1343                        msecs = vm_pageout_burst_wait;
1344                        goto vm_pageout_scan_delay;
1345
1346                } else if (VM_PAGE_Q_THROTTLED(iq)) {
1347
1348                        switch (flow_control.state) {
1349
1350                        case FCS_IDLE:
1351reset_deadlock_timer:
1352                                ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1353                                ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1354                                sysclk_gettime(&flow_control.ts);
1355                                ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1356                                
1357                                flow_control.state = FCS_DELAYED;
1358                                msecs = vm_pageout_deadlock_wait;
1359
1360                                break;
1361                                        
1362                        case FCS_DELAYED:
1363                                sysclk_gettime(&ts);
1364
1365                                if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1366                                        /*
1367                                         * the pageout thread for the default pager is potentially
1368                                         * deadlocked since the 
1369                                         * default pager queue has been throttled for more than the
1370                                         * allowable time... we need to move some clean pages or dirty
1371                                         * pages belonging to the external pagers if they aren't throttled
1372                                         * vm_page_free_wanted represents the number of threads currently
1373                                         * blocked waiting for pages... we'll move one page for each of
1374                                         * these plus a fixed amount to break the logjam... once we're done
1375                                         * moving this number of pages, we'll re-enter the FCS_DELAYED state
1376                                         * with a new timeout target since we have no way of knowing 
1377                                         * whether we've broken the deadlock except through observation
1378                                         * of the queue associated with the default pager... we need to
1379                                         * stop moving pages and allow the system to run to see what
1380                                         * state it settles into.
1381                                         */
1382                                        vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted;
1383                                        vm_pageout_scan_deadlock_detected++;
1384                                        flow_control.state = FCS_DEADLOCK_DETECTED;
1385
1386                                        thread_wakeup((event_t) &vm_pageout_garbage_collect);
1387                                        goto consider_inactive;
1388                                }
1389                                /*
1390                                 * just resniff instead of trying
1391                                 * to compute a new delay time... we're going to be
1392                                 * awakened immediately upon a laundry completion,
1393                                 * so we won't wait any longer than necessary
1394                                 */
1395                                msecs = vm_pageout_idle_wait;
1396                                break;
1397
1398                        case FCS_DEADLOCK_DETECTED:
1399                                if (vm_pageout_deadlock_target)
1400                                        goto consider_inactive;
1401                                goto reset_deadlock_timer;
1402
1403                        }
1404                        vm_pageout_scan_throttle++;
1405                        iq->pgo_throttled = TRUE;
1406vm_pageout_scan_delay:
1407                        if (object != NULL) {
1408                                vm_object_unlock(object);
1409                                object = NULL;
1410                        }
1411                        if (local_freeq) {
1412                                vm_page_free_list(local_freeq);
1413                                        
1414                                local_freeq = 0;
1415                                local_freed = 0;
1416                        }
1417                        assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1418
1419                        counter(c_vm_pageout_scan_block++);
1420
1421                        vm_page_unlock_queues();
1422                                
1423                        thread_block(THREAD_CONTINUE_NULL);
1424
1425                        vm_page_lock_queues();
1426                        delayed_unlock = 1;
1427
1428                        iq->pgo_throttled = FALSE;
1429
1430                        if (loop_count >= vm_page_inactive_count) {
1431                                if (VM_PAGE_Q_THROTTLED(eq) || VM_PAGE_Q_THROTTLED(iq)) {
1432                                        /*
1433                                         * Make sure we move enough "appropriate"
1434                                         * pages to the inactive queue before trying
1435                                         * again.
1436                                         */
1437                                        need_internal_inactive = vm_pageout_inactive_relief;
1438                                }
1439                                loop_count = 0;
1440                        }
1441                        inactive_burst_count = 0;
1442
1443                        goto Restart;
1444                        /*NOTREACHED*/
1445                }
1446
1447
1448                flow_control.state = FCS_IDLE;
1449consider_inactive:
1450                loop_count++;
1451                inactive_burst_count++;
1452                vm_pageout_inactive++;
1453
1454                if (!queue_empty(&vm_page_queue_inactive)) {
1455                        m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1456                        
1457                        if (m->clustered && (m->no_isync == TRUE)) {
1458                                goto use_this_page;
1459                        }
1460                }
1461                if (vm_zf_count < vm_accellerate_zf_pageout_trigger) {
1462                        vm_zf_iterator = 0;
1463                } else {
1464                        last_page_zf = 0;
1465                        if((vm_zf_iterator+=1) >= vm_zf_iterator_count) {
1466                                        vm_zf_iterator = 0;
1467                        }
1468                }
1469                if (queue_empty(&vm_page_queue_zf) ||
1470                                (((last_page_zf) || (vm_zf_iterator == 0)) &&
1471                                !queue_empty(&vm_page_queue_inactive))) {
1472                        m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1473                        last_page_zf = 0;
1474                } else {
1475                        m = (vm_page_t) queue_first(&vm_page_queue_zf);
1476                        last_page_zf = 1;
1477                }
1478use_this_page:
1479                assert(!m->active && m->inactive);
1480                assert(!m->laundry);
1481                assert(m->object != kernel_object);
1482
1483                /*
1484                 * Try to lock object; since we've already got the
1485                 * page queues lock, we can only 'try' for this one.
1486                 * if the 'try' fails, we need to do a mutex_pause
1487                 * to allow the owner of the object lock a chance to
1488                 * run... otherwise, we're likely to trip over this
1489                 * object in the same state as we work our way through
1490                 * the queue... clumps of pages associated with the same
1491                 * object are fairly typical on the inactive and active queues
1492                 */
1493                if (m->object != object) {
1494                        if (object != NULL) {
1495                                vm_object_unlock(object);
1496                                object = NULL;
1497                        }
1498                        if (!vm_object_lock_try(m->object)) {
1499                                /*
1500                                 *      Move page to end and continue.
1501                                 *      Don't re-issue ticket
1502                                 */
1503                                if (m->zero_fill) {
1504                                        queue_remove(&vm_page_queue_zf, m,
1505                                                     vm_page_t, pageq);
1506                                        queue_enter(&vm_page_queue_zf, m,
1507                                                    vm_page_t, pageq);
1508                                } else {
1509                                        queue_remove(&vm_page_queue_inactive, m,
1510                                                     vm_page_t, pageq);
1511                                        queue_enter(&vm_page_queue_inactive, m,
1512                                                    vm_page_t, pageq);
1513                                }
1514                                vm_pageout_inactive_nolock++;
1515
1516                                /*
1517                                 * force us to dump any collected free pages
1518                                 * and to pause before moving on
1519                                 */
1520                                delayed_unlock = DELAYED_UNLOCK_LIMIT + 1;
1521
1522                                goto done_with_inactivepage;
1523                        }
1524                        object = m->object;
1525                }
1526                /*
1527                 * If the page belongs to a purgable object with no pending copies
1528                 * against it, then we reap all of the pages in the object
1529                 * and note that the object has been "emptied".  It'll be up to the
1530                 * application to discover this and recreate its contents if desired.
1531                 */
1532                if ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE ||
1533                     object->purgable == VM_OBJECT_PURGABLE_EMPTY) &&
1534                    object->copy == VM_OBJECT_NULL) {
1535
1536                        (void) vm_object_purge(object);
1537                        vm_pageout_purged_objects++;
1538                        /*
1539                         * we've just taken all of the pages from this object,
1540                         * so drop the lock now since we're not going to find
1541                         * any more pages belonging to it anytime soon
1542                         */
1543                        vm_object_unlock(object);
1544                        object = NULL;
1545
1546                        inactive_burst_count = 0;
1547
1548                        goto done_with_inactivepage;
1549                }
1550
1551                /*
1552                 *      Paging out pages of external objects which
1553                 *      are currently being created must be avoided.
1554                 *      The pager may claim for memory, thus leading to a
1555                 *      possible dead lock between it and the pageout thread,
1556                 *      if such pages are finally chosen. The remaining assumption
1557                 *      is that there will finally be enough available pages in the
1558                 *      inactive pool to page out in order to satisfy all memory
1559                 *      claimed by the thread which concurrently creates the pager.
1560                 */
1561                if (!object->pager_initialized && object->pager_created) {
1562                        /*
1563                         *      Move page to end and continue, hoping that
1564                         *      there will be enough other inactive pages to
1565                         *      page out so that the thread which currently
1566                         *      initializes the pager will succeed.
1567                         *      Don't re-grant the ticket, the page should be
1568                         *      pulled from the queue and paged out whenever
1569                         *      one of its logically adjacent fellows is
1570                         *      targeted.
1571                         */
1572                        if (m->zero_fill) {
1573                                queue_remove(&vm_page_queue_zf, m,
1574                                             vm_page_t, pageq);
1575                                queue_enter(&vm_page_queue_zf, m,
1576                                            vm_page_t, pageq);
1577                                last_page_zf = 1;
1578                                vm_zf_iterator = vm_zf_iterator_count - 1;
1579                        } else {
1580                                queue_remove(&vm_page_queue_inactive, m,
1581                                             vm_page_t, pageq);
1582                                queue_enter(&vm_page_queue_inactive, m,
1583                                            vm_page_t, pageq);
1584                                last_page_zf = 0;
1585                                vm_zf_iterator = 1;
1586                        }
1587                        vm_pageout_inactive_avoid++;
1588
1589                        goto done_with_inactivepage;
1590                }
1591                /*
1592                 *      Remove the page from the inactive list.
1593                 */
1594                if (m->zero_fill) {
1595                        queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1596                } else {
1597                        queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1598                }
1599                m->pageq.next = NULL;
1600                m->pageq.prev = NULL;
1601                m->inactive = FALSE;
1602                if (!m->fictitious)
1603                        vm_page_inactive_count--;
1604
1605                if (m->busy || !object->alive) {
1606                        /*
1607                         *      Somebody is already playing with this page.
1608                         *      Leave it off the pageout queues.
1609                         */
1610                        vm_pageout_inactive_busy++;
1611
1612                        goto done_with_inactivepage;
1613                }
1614
1615                /*
1616                 *      If it's absent or in error, we can reclaim the page.
1617                 */
1618
1619                if (m->absent || m->error) {
1620                        vm_pageout_inactive_absent++;
1621reclaim_page:
1622                        if (vm_pageout_deadlock_target) {
1623                                vm_pageout_scan_inactive_throttle_success++;
1624                                vm_pageout_deadlock_target--;
1625                        }
1626                        if (m->tabled)
1627                                vm_page_remove(m);    /* clears tabled, object, offset */
1628                        if (m->absent)
1629                                vm_object_absent_release(object);
1630
1631                        assert(m->pageq.next == NULL &&
1632                               m->pageq.prev == NULL);
1633                        m->pageq.next = (queue_entry_t)local_freeq;
1634                        local_freeq = m;
1635                        local_freed++;
1636
1637                        inactive_burst_count = 0;
1638
1639                        goto done_with_inactivepage;
1640                }
1641
1642                assert(!m->private);
1643                assert(!m->fictitious);
1644
1645                /*
1646                 *      If already cleaning this page in place, convert from
1647                 *      "adjacent" to "target". We can leave the page mapped,
1648                 *      and vm_pageout_object_terminate will determine whether
1649                 *      to free or reactivate.
1650                 */
1651
1652                if (m->cleaning) {
1653                        m->busy = TRUE;
1654                        m->pageout = TRUE;
1655                        m->dump_cleaning = TRUE;
1656                        vm_page_wire(m);
1657
1658                        CLUSTER_STAT(vm_pageout_cluster_conversions++);
1659
1660                        inactive_burst_count = 0;
1661
1662                        goto done_with_inactivepage;
1663                }
1664
1665                /*
1666                 *      If it's being used, reactivate.
1667                 *      (Fictitious pages are either busy or absent.)
1668                 */
1669                if ( (!m->reference) ) {
1670                        refmod_state = pmap_get_refmod(m->phys_page);
1671                  
1672                        if (refmod_state & VM_MEM_REFERENCED)
1673                                m->reference = TRUE;
1674                        if (refmod_state & VM_MEM_MODIFIED)
1675                                m->dirty = TRUE;
1676                }
1677                if (m->reference) {
1678was_referenced:
1679                        vm_page_activate(m);
1680                        VM_STAT(reactivations++);
1681
1682                        vm_pageout_inactive_used++;
1683                        last_page_zf = 0;
1684                        inactive_burst_count = 0;
1685
1686                        goto done_with_inactivepage;
1687                }
1688
1689                XPR(XPR_VM_PAGEOUT,
1690                "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1691                (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1692
1693                /*
1694                 * we've got a candidate page to steal...
1695                 *
1696                 * m->dirty is up to date courtesy of the
1697                 * preceding check for m->reference... if 
1698                 * we get here, then m->reference had to be
1699                 * FALSE which means we did a pmap_get_refmod
1700                 * and updated both m->reference and m->dirty
1701                 *
1702                 * if it's dirty or precious we need to
1703                 * see if the target queue is throttled
1704                 * if it is, we need to skip over it by moving it back
1705                 * to the end of the inactive queue
1706                 */
1707                inactive_throttled = FALSE;
1708
1709                if (m->dirty || m->precious) {
1710                        if (object->internal) {
1711                                if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1712                                        inactive_throttled = TRUE;
1713                        } else if (VM_PAGE_Q_THROTTLED(eq)) {
1714                                        inactive_throttled = TRUE;
1715                        }
1716                }
1717                if (inactive_throttled == TRUE) {
1718                        if (m->zero_fill) {
1719                                queue_enter(&vm_page_queue_zf, m,
1720                                            vm_page_t, pageq);
1721                        } else {
1722                                queue_enter(&vm_page_queue_inactive, m,
1723                                            vm_page_t, pageq);
1724                        }
1725                        if (!m->fictitious)
1726                                vm_page_inactive_count++;
1727                        m->inactive = TRUE;
1728
1729                        vm_pageout_scan_inactive_throttled++;
1730
1731                        goto done_with_inactivepage;
1732                }
1733                /*
1734                 * we've got a page that we can steal...
1735                 * eliminate all mappings and make sure
1736                 * we have the up-to-date modified state
1737                 * first take the page BUSY, so that no new
1738                 * mappings can be made
1739                 */
1740                m->busy = TRUE;
1741                
1742                /*
1743                 * if we need to do a pmap_disconnect then we
1744                 * need to re-evaluate m->dirty since the pmap_disconnect
1745                 * provides the true state atomically... the 
1746                 * page was still mapped up to the pmap_disconnect
1747                 * and may have been dirtied at the last microsecond
1748                 *
1749                 * we also check for the page being referenced 'late'
1750                 * if it was, we first need to do a WAKEUP_DONE on it
1751                 * since we already set m->busy = TRUE, before 
1752                 * going off to reactivate it
1753                 *
1754                 * if we don't need the pmap_disconnect, then
1755                 * m->dirty is up to date courtesy of the
1756                 * earlier check for m->reference... if 
1757                 * we get here, then m->reference had to be
1758                 * FALSE which means we did a pmap_get_refmod
1759                 * and updated both m->reference and m->dirty...
1760                 */
1761                if (m->no_isync == FALSE) {
1762                        refmod_state = pmap_disconnect(m->phys_page);
1763
1764                        if (refmod_state & VM_MEM_MODIFIED)
1765                                m->dirty = TRUE;
1766                        if (refmod_state & VM_MEM_REFERENCED) {
1767                                m->reference = TRUE;
1768
1769                                PAGE_WAKEUP_DONE(m);
1770                                goto was_referenced;
1771                        }
1772                }
1773                /*
1774                 *      If it's clean and not precious, we can free the page.
1775                 */
1776                if (!m->dirty && !m->precious) {
1777                        vm_pageout_inactive_clean++;
1778                        goto reclaim_page;
1779                }
1780                vm_pageout_cluster(m);
1781
1782                vm_pageout_inactive_dirty++;
1783
1784                inactive_burst_count = 0;
1785
1786done_with_inactivepage:
1787                if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1788
1789                        if (object != NULL) {
1790                                vm_object_unlock(object);
1791                                object = NULL;
1792                        }
1793                        if (local_freeq) {
1794                                vm_page_free_list(local_freeq);
1795                                
1796                                local_freeq = 0;
1797                                local_freed = 0;
1798                        }
1799                        delayed_unlock = 0;
1800                        vm_page_unlock_queues();
1801                        mutex_pause();
1802                }
1803                /*
1804                 * back to top of pageout scan loop
1805                 */
1806        }
1807}
1808
1809
1810int vm_page_free_count_init;      /* vm_page_free_count as sampled in vm_pageout() before the pageout threads are started; vm_page_free_reserve() derives vm_page_free_min/vm_page_free_target from it */
1811/*
     *      vm_page_free_reserve:
     *
     *      Add "pages" to vm_page_free_reserved and recompute the two
     *      thresholds that depend on it.  vm_page_free_min and
     *      vm_page_free_target are each set to the new reserve plus a
     *      value computed (by VM_PAGE_FREE_MIN / VM_PAGE_FREE_TARGET)
     *      from the pages left over once the reserve is subtracted from
     *      vm_page_free_count_init.
     *
     *      Calling with pages == 0 leaves the reserve unchanged and only
     *      recomputes the two thresholds.
     */
1812void
1813vm_page_free_reserve(
1814        int pages)
1815{
1816        int             free_after_reserve;
1817
1818        vm_page_free_reserved += pages;
1819
1820        free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1821
1822        vm_page_free_min = vm_page_free_reserved +
1823                VM_PAGE_FREE_MIN(free_after_reserve);
1824
1825        vm_page_free_target = vm_page_free_reserved +
1826                VM_PAGE_FREE_TARGET(free_after_reserve);
1827
            /* keep the target at least 5 pages above the minimum */
1828        if (vm_page_free_target < vm_page_free_min + 5)
1829                vm_page_free_target = vm_page_free_min + 5;
1830}
1831
1832/*
1833 *      vm_pageout is the high level pageout daemon.
     *
     *      vm_pageout_continue is the part of it that repeats: it bumps
     *      vm_pageout_scan_event_counter, runs one vm_pageout_scan()
     *      pass, then waits uninterruptibly on &vm_page_free_wanted.
     *      thread_block() is handed this same function as its
     *      continuation and is not expected to return (see NOTREACHED),
     *      so the function is entered afresh rather than looping.
1834 */
1835
1836void
1837vm_pageout_continue(void)
1838{
1839        vm_pageout_scan_event_counter++;
1840        vm_pageout_scan();
1841        /* we hold vm_page_queue_free_lock now */
1842        assert(vm_page_free_wanted == 0);
            /*
             * the wait is registered before the lock is released --
             * presumably so that a wakeup issued in between cannot be lost
             */
1843        assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
1844        mutex_unlock(&vm_page_queue_free_lock);
1845
1846        counter(c_vm_pageout_block++);
1847        thread_block((thread_continue_t)vm_pageout_continue);
1848        /*NOTREACHED*/
1849}
1850
1851
1852/*
     * vm_pageout_queue_steal:
     *
     * take page "m" back off the pending list of a pageout queue --
     * the internal queue if m->object->internal is TRUE, the external
     * queue otherwise -- and undo the bookkeeping for it: clear
     * m->laundry and m->pageout_queue, null the page's queue links,
     * call vm_object_paging_end() on its object and decrement the
     * queue's pgo_laundry count.
     *
1853 * must be called with the
1854 * queues and object locks held
1855 */
1856static void
1857vm_pageout_queue_steal(vm_page_t m)
1858{
1859        struct vm_pageout_queue *q;
1860
            /* pick the queue by the kind of object the page belongs to */
1861        if (m->object->internal == TRUE)
1862                q = &vm_pageout_queue_internal;
1863        else
1864                q = &vm_pageout_queue_external;
1865
1866        m->laundry = FALSE;
1867        m->pageout_queue = FALSE;
1868        queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
1869
            /* the page is on no queue now, so clear its links */
1870        m->pageq.next = NULL;
1871        m->pageq.prev = NULL;
1872
1873        vm_object_paging_end(m->object);
1874
1875        q->pgo_laundry--;
1876}
1877
1878
1879#ifdef FAKE_DEADLOCK    /* test hook consumed by vm_pageout_iothread_continue() */
1880
1881#define FAKE_COUNT      5000    /* internal-queue pages handled between forced allocations */
1882
1883int internal_count = 0;        /* internal-queue pages handled since the last forced allocation */
1884int fake_deadlock = 0;         /* number of forced allocations attempted so far */
1885
1886#endif
1887/*
     *      vm_pageout_iothread_continue:
     *
     *      Service loop for one pageout I/O queue "q".  With the page
     *      queues lock held, pages are pulled one at a time off
     *      q->pgo_pending; the lock is dropped while each page is handed
     *      to its object's pager with memory_object_data_return() and is
     *      retaken before the queue is examined again.
     *
     *      For an object whose pager is not yet initialized, the object
     *      is collapsed and a pager is created on demand.  If there is
     *      still no initialized pager the page is reactivated; if the
     *      pager is MEMORY_OBJECT_NULL the page is freed.
     *
     *      Once the pending list is empty the thread marks the queue
     *      idle, issues a wakeup on &q->pgo_laundry if the queue had been
     *      flagged throttled and no longer is, and blocks with this
     *      function as its continuation.
     */
1888static void
1889vm_pageout_iothread_continue(struct vm_pageout_queue *q)
1890{
1891        vm_page_t       m = NULL;
1892        vm_object_t     object;
1893        boolean_t       need_wakeup;
1894
1895        vm_page_lock_queues();
1896
1897        while ( !queue_empty(&q->pgo_pending) ) {
1898
                       /*
                        * take the next page off the pending list while the
                        * page queues lock is still held, then drop the lock
                        * for the rest of the iteration
                        */
1899                   q->pgo_busy = TRUE;
1900                   queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
1901                   m->pageout_queue = FALSE;
1902                   vm_page_unlock_queues();
1903
1904                   m->pageq.next = NULL;
1905                   m->pageq.prev = NULL;
1906#ifdef FAKE_DEADLOCK
                       /*
                        * test hook: on every FAKE_COUNT'th page taken from the
                        * internal queue, try to allocate -- and free again on
                        * success -- as many pages as are currently free plus
                        * reserved
                        */
1907                   if (q == &vm_pageout_queue_internal) {
1908                           vm_offset_t addr;
1909                           int  pg_count;
1910
1911                           internal_count++;
1912
1913                           if ((internal_count == FAKE_COUNT)) {
1914
1915                                   pg_count = vm_page_free_count + vm_page_free_reserved;
1916
1917                                   if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
1918                                           kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
1919                                   }
1920                                   internal_count = 0;
1921                                   fake_deadlock++;
1922                           }
1923                   }
1924#endif
1925                   object = m->object;
1926
                       /*
                        * pager_initialized is first tested without the object
                        * lock; every further test below is made with it held
                        */
1927                   if (!object->pager_initialized) {
1928                           vm_object_lock(object);
1929
1930                           /*
1931                            *   If there is no memory object for the page, create
1932                            *   one and hand it to the default pager.
1933                            */
1934
1935                           if (!object->pager_initialized)
1936                                   vm_object_collapse(object, (vm_object_offset_t)0);
1937                           if (!object->pager_initialized)
1938                                   vm_object_pager_create(object);
1939                           if (!object->pager_initialized) {
1940                                   /*
1941                                    *   Still no pager for the object.
1942                                    *   Reactivate the page.
1943                                    *
1944                                    *   Should only happen if there is no
1945                                    *   default pager.
1946                                    */
1947                                   m->list_req_pending = FALSE;
1948                                   m->cleaning = FALSE;
1949                                   m->pageout = FALSE;
1950                                   vm_page_unwire(m);
1951
1952                                   vm_pageout_throttle_up(m);
1953
1954                                   vm_page_lock_queues();
1955                                   vm_pageout_dirty_no_pager++;
1956                                   vm_page_activate(m);
1957                                   vm_page_unlock_queues();
1958
1959                                   /*
1960                                    *   And we are done with it.
1961                                    */
1962                                   PAGE_WAKEUP_DONE(m);
1963
1964                                   vm_object_paging_end(object);
1965                                   vm_object_unlock(object);
1966
                                       /* the loop condition is evaluated with the queues locked */
1967                                   vm_page_lock_queues();
1968                                   continue;
1969                           } else if (object->pager == MEMORY_OBJECT_NULL) {
1970                                   /*
1971                                    * This pager has been destroyed by either
1972                                    * memory_object_destroy or vm_object_destroy, and
1973                                    * so there is nowhere for the page to go.
1974                                    * Just free the page... VM_PAGE_FREE takes
1975                                    * care of cleaning up all the state...
1976                                    * including doing the vm_pageout_throttle_up
1977                                    */
1978                                   VM_PAGE_FREE(m);
1979
1980                                   vm_object_paging_end(object);
1981                                   vm_object_unlock(object);
1982
                                       /* the loop condition is evaluated with the queues locked */
1983                                   vm_page_lock_queues();
1984                                   continue;
1985                           }
1986                           vm_object_unlock(object);
1987                   }
1988                   /*
1989                    * we expect the paging_in_progress reference to have
1990                    * already been taken on the object before it was added
1991                    * to the appropriate pageout I/O queue... this will
1992                    * keep the object from being terminated and/or the 
1993                    * paging_offset from changing until the I/O has 
1994                    * completed... therefore no need to lock the object to
1995                    * pull the paging_offset from it.
1996                    *
1997                    * Send the data to the pager.
1998                    * any pageout clustering happens there
1999                    */
2000                   memory_object_data_return(object->pager,
2001                                             m->offset + object->paging_offset,
2002                                             PAGE_SIZE,
2003                                             NULL,
2004                                             NULL,
2005                                             FALSE,
2006                                             FALSE,
2007                                             0);
2008
                       /* release the paging reference now that the data has been handed off */
2009                   vm_object_lock(object);
2010                   vm_object_paging_end(object);
2011                   vm_object_unlock(object);
2012
2013                   vm_page_lock_queues();
2014        }
            /*
             * the pending list is empty: register the wait on the queue's
             * address before the page queues lock is released
             */
2015        assert_wait((event_t) q, THREAD_UNINT);
2016
2017
            /*
             * if the queue was flagged throttled but no longer is, clear
             * the flag and remember to issue the wakeup once unlocked
             */
2018        if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2019                q->pgo_throttled = FALSE;
2020                need_wakeup = TRUE;
2021        } else
2022                need_wakeup = FALSE;
2023
2024        q->pgo_busy = FALSE;
2025        q->pgo_idle = TRUE;
2026        vm_page_unlock_queues();
2027
2028        if (need_wakeup == TRUE)
2029                thread_wakeup((event_t) &q->pgo_laundry);
2030
            /*
             * NOTE(review): the continuation is declared to take a
             * struct vm_pageout_queue *, but is handed &q->pgo_pending
             * here; that is only the same pointer if pgo_pending is the
             * first member of the struct -- confirm against its definition
             */
2031        thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2032        /*NOTREACHED*/
2033}
2034
2035/*
     *      vm_pageout_iothread_external:
     *
     *      Thread entry point (started from vm_pageout()) that services
     *      vm_pageout_queue_external.  Control passes straight to
     *      vm_pageout_iothread_continue() and is not expected to come back.
     */
2036static void
2037vm_pageout_iothread_external(void)
2038{
2039
2040        vm_pageout_iothread_continue(&vm_pageout_queue_external);
2041        /*NOTREACHED*/
2042}
2043
2044/*
     *      vm_pageout_iothread_internal:
     *
     *      Thread entry point (started from vm_pageout()) that services
     *      vm_pageout_queue_internal.  Unlike the external variant it
     *      first sets TH_OPT_VMPRIV in the calling thread's options.
     */
2045static void
2046vm_pageout_iothread_internal(void)
2047{
2048        thread_t        self = current_thread();
2049
            /* NOTE(review): presumably marks this thread as VM-privileged for allocations -- confirm where TH_OPT_VMPRIV is tested */
2050        self->options |= TH_OPT_VMPRIV;
2051
2052        vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2053        /*NOTREACHED*/
2054}
2055/*
     *      vm_pageout_garbage_collect:
     *
     *      Body of the garbage collection thread.  When "collect" is
     *      non-zero it runs stack_collect(), consider_machine_collect(),
     *      consider_zone_gc() and consider_machine_adjust(), in that
     *      order.  It then waits uninterruptibly on the address of this
     *      function and blocks with itself as the continuation, passing
     *      1 so that every later entry does collect.
     *
     *      vm_pageout() starts the thread with a NULL parameter, so --
     *      presumably -- the first entry arrives with collect == 0 and
     *      only sets up the wait.
     */
2056static void
2057vm_pageout_garbage_collect(int collect)
2058{
2059        if (collect) {
2060                stack_collect();
2061
2062                /*
2063                 * consider_zone_gc should be last, because the other operations
2064                 * might return memory to zones.
2065                 */
2066                consider_machine_collect();
2067                consider_zone_gc();
2068
                    /*
                     * NOTE(review): this call follows consider_zone_gc() even
                     * though the comment above asks for zone gc to be last --
                     * confirm the ordering is intended
                     */
2069                consider_machine_adjust();
2070        }
2071
2072        assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2073
2074        thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2075        /*NOTREACHED*/
2076}
2077
2078
2079/*
     *      vm_pageout:
     *
     *      Entry point of the pageout daemon thread.  In order, it:
     *        - sets its own priority to BASEPRI_PREEMPT - 1;
     *        - gives every paging tunable that is still zero its
     *          compile-time default;
     *        - sets VM_BACKING_STORE_PRIV in kernel_task->priv_flags;
     *        - records vm_page_free_count in vm_page_free_count_init and
     *          (re)computes the free page reserve and thresholds;
     *        - initializes the external and internal pageout queues;
     *        - starts the internal I/O, external I/O and garbage
     *          collection threads, panicking if any cannot be created;
     *        - enters vm_pageout_continue(), which is not expected to
     *          return.
     */
2080void
2081vm_pageout(void)
2082{
2083        thread_t        self = current_thread();
2084        thread_t        thread;
2085        kern_return_t   result;
2086        spl_t           s;
2087
2088        /*
2089         * Set thread privileges.
2090         */
2091        s = splsched();
2092        thread_lock(self);
2093        self->priority = BASEPRI_PREEMPT - 1;
2094        set_sched_pri(self, self->priority);
2095        thread_unlock(self);
2096        splx(s);
2097
2098        /*
2099         *      Initialize some paging parameters.
         *      A value of zero is taken to mean "not set yet"; anything
         *      already non-zero is left alone.
2100         */
2101
2102        if (vm_pageout_idle_wait == 0)
2103                vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2104
2105        if (vm_pageout_burst_wait == 0)
2106                vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2107
2108        if (vm_pageout_empty_wait == 0)
2109                vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2110
2111        if (vm_pageout_deadlock_wait == 0)
2112                vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2113
2114        if (vm_pageout_deadlock_relief == 0)
2115                vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2116
2117        if (vm_pageout_inactive_relief == 0)
2118                vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2119
2120        if (vm_pageout_burst_active_throttle == 0)
2121                vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2122
2123        if (vm_pageout_burst_inactive_throttle == 0)
2124                vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2125
2126        /*
2127         * Set kernel task to low backing store privileged 
2128         * status
2129         */
2130        task_lock(kernel_task);
2131        kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2132        task_unlock(kernel_task);
2133
2134        vm_page_free_count_init = vm_page_free_count;
2135        vm_zf_iterator = 0;
2136        /*
2137         * even if we've already called vm_page_free_reserve
2138         * call it again here to insure that the targets are
2139         * accurately calculated (it uses vm_page_free_count_init)
2140         * calling it with an arg of 0 will not change the reserve
2141         * but will re-calculate free_min and free_target
2142         */
2143        if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
                    /* top the reserve up to VM_PAGE_FREE_RESERVED(processor_count) */
2144                vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2145        } else
2146                vm_page_free_reserve(0);
2147
2148
            /* both pageout queues start out empty, not busy, not idle and not throttled */
2149        queue_init(&vm_pageout_queue_external.pgo_pending);
2150        vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2151        vm_pageout_queue_external.pgo_laundry = 0;
2152        vm_pageout_queue_external.pgo_idle = FALSE;
2153        vm_pageout_queue_external.pgo_busy = FALSE;
2154        vm_pageout_queue_external.pgo_throttled = FALSE;
2155
2156        queue_init(&vm_pageout_queue_internal.pgo_pending);
2157        vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2158        vm_pageout_queue_internal.pgo_laundry = 0;
2159        vm_pageout_queue_internal.pgo_idle = FALSE;
2160        vm_pageout_queue_internal.pgo_busy = FALSE;
2161        vm_pageout_queue_internal.pgo_throttled = FALSE;
2162
2163
            /*
             * start the helper threads; after each successful start the
             * thread handle is passed to thread_deallocate() -- presumably
             * dropping the reference the start call returned (confirm)
             */
2164        result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &thread);
2165        if (result != KERN_SUCCESS)
2166                panic("vm_pageout_iothread_internal: create failed");
2167
2168        thread_deallocate(thread);
2169
2170
2171        result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, BASEPRI_PREEMPT - 1, &thread);
2172        if (result != KERN_SUCCESS)
2173                panic("vm_pageout_iothread_external: create failed");
2174
2175        thread_deallocate(thread);
2176
2177
            /* the garbage collector is started one priority step lower than the I/O threads */
2178        result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_PREEMPT - 2, &thread);
2179        if (result != KERN_SUCCESS)
2180                panic("vm_pageout_garbage_collect: create failed");
2181
2182        thread_deallocate(thread);
2183
2184
2185        vm_pageout_continue();
2186        /*NOTREACHED*/
2187}
2188
2189/*
     *      upl_create:
     *
     *      Allocate a upl with kalloc() and initialize its header.
     *
     *      flags   UPL_CREATE_LITE adds room for a bitmap holding one bit
     *              per page of "size", rounded up to a multiple of 4
     *              bytes; UPL_CREATE_INTERNAL adds room for one
     *              struct upl_page_info per page of "size".
     *      size    length in bytes of the range the upl will describe;
     *              used here only to size the allocation.
     *
     *      Returns the new upl with ref_count 1, flags 0, size 0, kaddr 0
     *      and src_object/map_object NULL.  Nothing beyond those header
     *      fields is initialized here.
     *
     *      NOTE(review): the kalloc() result is dereferenced without a
     *      NULL check.  upl_destroy() recomputes the allocation size from
     *      upl->flags and upl->size, so the caller presumably has to set
     *      both to match what was requested here -- confirm.
     */
2190static upl_t
2191upl_create(
2192        int                flags,
2193        upl_size_t       size)
2194{
2195        upl_t   upl;
2196        int     page_field_size;  /* bit field in word size buf */
2197
2198        page_field_size = 0;
2199        if (flags & UPL_CREATE_LITE) {
                    /* one bit per page, rounded up to whole bytes ... */
2200                page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
                    /* ... then rounded up to a multiple of 4 bytes */
2201                page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2202        }
2203        if(flags & UPL_CREATE_INTERNAL) {
2204                upl = (upl_t)kalloc(sizeof(struct upl)
2205                        + (sizeof(struct upl_page_info)*(size/PAGE_SIZE))
2206                        + page_field_size);
2207        } else {
2208                upl = (upl_t)kalloc(sizeof(struct upl) + page_field_size);
2209        }
2210        upl->flags = 0;
2211        upl->src_object = NULL;
2212        upl->kaddr = (vm_offset_t)0;
2213        upl->size = 0;
2214        upl->map_object = NULL;
2215        upl->ref_count = 1;
2216        upl_lock_init(upl);
2217#ifdef UPL_DEBUG
2218        upl->ubc_alias1 = 0;
2219        upl->ubc_alias2 = 0;
2220#endif /* UPL_DEBUG */
2221        return(upl);
2222}
2223/*
     *      upl_destroy:
     *
     *      Release the memory of a upl.  Under UPL_DEBUG the upl is first
     *      unlinked from the uplq of its object (map_object, or
     *      map_object->shadow when map_object is a pageout object).  The
     *      reference on a pageout map_object is then dropped, and the
     *      allocation size is recomputed from upl->flags (UPL_LITE,
     *      UPL_INTERNAL) and upl->size with the same arithmetic as
     *      upl_create() before the block is handed to kfree().
     *
     *      upl->map_object is dereferenced unconditionally, so it must be
     *      non-NULL by the time this is called.
     */
2224static void
2225upl_destroy(
2226        upl_t   upl)
2227{
2228        int     page_field_size;  /* bit field in word size buf */
2229
2230#ifdef UPL_DEBUG
2231        {
2232                upl_t   upl_ele;
2233                vm_object_t     object;
2234                if (upl->map_object->pageout) {
2235                        object = upl->map_object->shadow;
2236                } else {
2237                        object = upl->map_object;
2238                }
2239                vm_object_lock(object);
                    /* find this upl on the object's list and unlink it */
2240                queue_iterate(&object->uplq, upl_ele, upl_t, uplq) {
2241                        if(upl_ele == upl) {
2242                                queue_remove(&object->uplq, 
2243                                                upl_ele, upl_t, uplq);
2244                                break;
2245                        }
2246                }
2247                vm_object_unlock(object);
2248        }
2249#endif /* UPL_DEBUG */
2250        /* drop a reference on the map_object whether or */
2251        /* not a pageout object is inserted */
            /*
             * NOTE(review): the code below drops the reference only when
             * map_object->pageout is set, which does not match the
             * "whether or not" wording above -- confirm which is intended
             */
2252        if(upl->map_object->pageout)
2253                vm_object_deallocate(upl->map_object);
2254
            /* size of the bitmap, computed exactly as in upl_create() */
2255        page_field_size = 0;
2256        if (upl->flags & UPL_LITE) {
2257                page_field_size = ((upl->size/PAGE_SIZE) + 7) >> 3;
2258                page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2259        }
2260        if(upl->flags & UPL_INTERNAL) {
2261                kfree(upl,
2262                      sizeof(struct upl) + 
2263                      (sizeof(struct upl_page_info) * (upl->size/PAGE_SIZE))
2264                      + page_field_size);
2265        } else {
2266                kfree(upl, sizeof(struct upl) + page_field_size);
2267        }
2268}
2269
2270void uc_upl_dealloc(upl_t upl);
2271__private_extern__ void
2272uc_upl_dealloc(
2273        upl_t   upl)
2274{
2275        upl->ref_count -= 1;
2276        if(upl->ref_count == 0) {
2277                upl_destroy(upl);
2278        }
2279}
2280
2281void
2282upl_deallocate(
2283        upl_t   upl)
2284{
2285        
2286        upl->ref_count -= 1;
2287        if(upl->ref_count == 0) {
2288                upl_destroy(upl);
2289        }
2290}
2291
/*
 * Counters recording how often UPL creation had to honor
 * copy-on-write obligations.  Each event counter is paired with a
 * "_pages" counter for the number of pages involved.
 */
unsigned long upl_cow = 0UL;                    /* COW syncs on ordinary objects */
unsigned long upl_cow_again = 0UL;              /* repeat COW syncs */
unsigned long upl_cow_contiguous = 0UL;         /* COW syncs on phys_contiguous objects */
unsigned long upl_cow_pages = 0UL;              /* pages covered by upl_cow */
unsigned long upl_cow_again_pages = 0UL;        /* pages covered by upl_cow_again */
unsigned long upl_cow_contiguous_pages = 0UL;   /* pages covered by upl_cow_contiguous */
2301
2302/*  
2303 *      Routine:        vm_object_upl_request 
2304 *      Purpose:        
2305 *              Cause the population of a portion of a vm_object.
2306 *              Depending on the nature of the request, the pages
2307 *              returned may contain valid data or be uninitialized.
2308 *              A page list structure, listing the physical pages
2309 *              will be returned upon request.
2310 *              This function is called by the file system or any other
2311 *              supplier of backing store to a pager.
2312 *              IMPORTANT NOTE: The caller must still respect the relationship
2313 *              between the vm_object and its backing memory object.  The
2314 *              caller MUST NOT substitute changes in the backing file
2315 *              without first doing a memory_object_lock_request on the 
2316 *              target range unless it is known that the pages are not
2317 *              shared with another entity at the pager level.
2318 *              Copy_in_to:
2319 *                      if a page list structure is present
2320 *                      return the mapped physical pages, where a
2321 *                      page is not present, return a non-initialized
2322 *                      one.  If the no_sync bit is turned on, don't
2323 *                      call the pager unlock to synchronize with other
2324 *                      possible copies of the page. Leave pages busy
2325 *                      in the original object, if a page list structure
2326 *                      was specified.  When a commit of the page list
2327 *                      pages is done, the dirty bit will be set for each one.
2328 *              Copy_out_from:
2329 *                      If a page list structure is present, return
2330 *                      all mapped pages.  Where a page does not exist
2331 *                      map a zero filled one. Leave pages busy in
2332 *                      the original object.  If a page list structure
2333 *                      is not specified, this call is a no-op. 
2334 *
2335 *              Note:  access of default pager objects has a rather interesting
2336 *              twist.  The caller of this routine, presumably the file system
2337 *              page cache handling code, will never actually make a request
2338 *              against a default pager backed object.  Only the default
2339 *              pager will make requests on backing store related vm_objects.
2340 *              In this way the default pager can maintain the relationship
2341 *              between backing store files (abstract memory objects) and 
2342 *              the vm_objects (cache objects), they support.
2343 *
2344 */
2345
2346__private_extern__ kern_return_t
2347vm_object_upl_request(
2348        vm_object_t             object,
2349        vm_object_offset_t      offset,
2350        upl_size_t              size,
2351        upl_t                   *upl_ptr,
2352        upl_page_info_array_t   user_page_list,
2353        unsigned int            *page_list_count,
2354        int                     cntrl_flags)
2355{
2356        vm_page_t               dst_page = VM_PAGE_NULL;
2357        vm_object_offset_t      dst_offset = offset;
2358        upl_size_t              xfer_size = size;
2359        boolean_t               do_m_lock = FALSE;
2360        boolean_t               dirty;
2361        boolean_t               hw_dirty;
2362        upl_t                   upl = NULL;
2363        unsigned int            entry;
2364#if MACH_CLUSTER_STATS
2365        boolean_t               encountered_lrp = FALSE;
2366#endif
2367        vm_page_t               alias_page = NULL;
2368        int                     page_ticket; 
2369        int                     refmod_state;
2370        wpl_array_t             lite_list = NULL;
2371        vm_object_t             last_copy_object;
2372
2373
2374        if (cntrl_flags & ~UPL_VALID_FLAGS) {
2375                /*
2376                 * For forward compatibility's sake,
2377                 * reject any unknown flag.
2378                 */
2379                return KERN_INVALID_VALUE;
2380        }
2381
2382        page_ticket = (cntrl_flags & UPL_PAGE_TICKET_MASK)
2383                                        >> UPL_PAGE_TICKET_SHIFT;
2384
2385        if(((size/PAGE_SIZE) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
2386                size = MAX_UPL_TRANSFER * PAGE_SIZE;
2387        }
2388
2389        if(cntrl_flags & UPL_SET_INTERNAL)
2390                if(page_list_count != NULL)
2391                        *page_list_count = MAX_UPL_TRANSFER;
2392
2393        if((!object->internal) && (object->paging_offset != 0))
2394                panic("vm_object_upl_request: vnode object with non-zero paging offset\n");
2395
2396        if((cntrl_flags & UPL_COPYOUT_FROM) && (upl_ptr == NULL)) {
2397                return KERN_SUCCESS;
2398        }
2399
2400        vm_object_lock(object);
2401        vm_object_paging_begin(object);
2402        vm_object_unlock(object);
2403
2404        if(upl_ptr) {
2405                if(cntrl_flags & UPL_SET_INTERNAL) {
2406                        if(cntrl_flags & UPL_SET_LITE) {
2407                                uintptr_t page_field_size;
2408                                upl = upl_create(
2409                                        UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
2410                                        size);
2411                                user_page_list = (upl_page_info_t *)
2412                                   (((uintptr_t)upl) + sizeof(struct upl));
2413                                lite_list = (wpl_array_t)
2414                                        (((uintptr_t)user_page_list) + 
2415                                        ((size/PAGE_SIZE) * 
2416                                                sizeof(upl_page_info_t)));
2417                                page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2418                                page_field_size = 
2419                                        (page_field_size + 3) & 0xFFFFFFFC;
2420                                bzero((char *)lite_list, page_field_size);
2421                                upl->flags = 
2422                                        UPL_LITE | UPL_INTERNAL;
2423                        } else {
2424                                upl = upl_create(UPL_CREATE_INTERNAL, size);
2425                                user_page_list = (upl_page_info_t *)
2426                                        (((uintptr_t)upl) + sizeof(struct upl));
2427                                upl->flags = UPL_INTERNAL;
2428                        }
2429                } else {
2430                        if(cntrl_flags & UPL_SET_LITE) {
2431                                uintptr_t page_field_size;
2432                                upl = upl_create(UPL_CREATE_LITE, size);
2433                                lite_list = (wpl_array_t)
2434                                   (((uintptr_t)upl) + sizeof(struct upl));
2435                                page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2436                                page_field_size = 
2437                                        (page_field_size + 3) & 0xFFFFFFFC;
2438                                bzero((char *)lite_list, page_field_size);
2439                                upl->flags = UPL_LITE;
2440                        } else {
2441                                upl = upl_create(UPL_CREATE_EXTERNAL, size);
2442                                upl->flags = 0;
2443                        }
2444                }
2445
2446                if (object->phys_contiguous) {
2447                        if ((cntrl_flags & UPL_WILL_MODIFY) &&
2448                            object->copy != VM_OBJECT_NULL) {
2449                                /* Honor copy-on-write obligations */
2450
2451                                /*
2452                                 * XXX FBDP
2453                                 * We could still have a race...
2454                                 * A is here building the UPL for a write().
2455                                 * A pushes the pages to the current copy
2456                                 * object.
2457                                 * A returns the UPL to the caller.
2458                                 * B comes along and establishes another
2459                                 * private mapping on this object, inserting 
2460                                 * a new copy object between the original
2461                                 * object and the old copy object.
2462                                 * B reads a page and gets the original contents
2463                                 * from the original object.
2464                                 * A modifies the page in the original object.
2465                                 * B reads the page again and sees A's changes,
2466                                 * which is wrong...
2467                                 *
2468                                 * The problem is that the pages are not
2469                                 * marked "busy" in the original object, so
2470                                 * nothing prevents B from reading it before
2471                                 * A's changes are completed.
2472                                 *
2473                                 * The "paging_in_progress" might protect us
2474                                 * from the insertion of a new copy object
2475                                 * though...  To be verified.
2476                                 */
2477                                vm_object_lock_request(object,
2478                                                       offset,
2479                                                       size,
2480                                                       FALSE,
2481                                                       MEMORY_OBJECT_COPY_SYNC,
2482                                                       VM_PROT_NO_CHANGE);
2483                                upl_cow_contiguous++;
2484                                upl_cow_contiguous_pages += size >> PAGE_SHIFT;
2485                        }
2486
2487                        upl->map_object = object;
2488                        /* don't need any shadow mappings for this one */
2489                        /* since it is already I/O memory */
2490                        upl->flags |= UPL_DEVICE_MEMORY;
2491
2492
2493                        /* paging_in_progress protects paging_offset */
2494                        upl->offset = offset + object->paging_offset;
2495                        upl->size = size;
2496                        *upl_ptr = upl;
2497                        if(user_page_list) {
2498                                user_page_list[0].phys_addr = 
2499                                   (offset + object->shadow_offset)>>PAGE_SHIFT;
2500                                user_page_list[0].device = TRUE;
2501                        }
2502
2503                        if(page_list_count != NULL) {
2504                                if (upl->flags & UPL_INTERNAL) {
2505                                        *page_list_count = 0;
2506                                } else {
2507                                        *page_list_count = 1;
2508                                }
2509                        }
2510
2511                        return KERN_SUCCESS;
2512                }
2513
2514                if(user_page_list)
2515                        user_page_list[0].device = FALSE;
2516
2517                if(cntrl_flags & UPL_SET_LITE) {
2518                        upl->map_object = object;
2519                } else {
2520                        upl->map_object = vm_object_allocate(size);
2521                        /*
2522                         * No need to lock the new object: nobody else knows
2523                         * about it yet, so it's all ours so far.
2524                         */
2525                        upl->map_object->shadow = object;
2526                        upl->map_object->pageout = TRUE;
2527                        upl->map_object->can_persist = FALSE;
2528                        upl->map_object->copy_strategy = 
2529                                        MEMORY_OBJECT_COPY_NONE;
2530                        upl->map_object->shadow_offset = offset;
2531                        upl->map_object->wimg_bits = object->wimg_bits;
2532                }
2533
2534        }
2535        if (!(cntrl_flags & UPL_SET_LITE)) {
2536                VM_PAGE_GRAB_FICTITIOUS(alias_page);
2537        }
2538
2539        /*
2540         * ENCRYPTED SWAP:
2541         * Just mark the UPL as "encrypted" here.
2542         * We'll actually encrypt the pages later,
2543         * in upl_encrypt(), when the caller has
2544         * selected which pages need to go to swap.
2545         */
2546        if (cntrl_flags & UPL_ENCRYPT) {
2547                upl->flags |= UPL_ENCRYPTED;
2548        }
2549        if (cntrl_flags & UPL_FOR_PAGEOUT) {
2550                upl->flags |= UPL_PAGEOUT;
2551        }
2552        vm_object_lock(object);
2553
2554        /* we can lock in the paging_offset once paging_in_progress is set */
2555        if(upl_ptr) {
2556                upl->size = size;
2557                upl->offset = offset + object->paging_offset;
2558                *upl_ptr = upl;
2559#ifdef UPL_DEBUG
2560                queue_enter(&object->uplq, upl, upl_t, uplq);
2561#endif /* UPL_DEBUG */
2562        }
2563
2564        if ((cntrl_flags & UPL_WILL_MODIFY) &&
2565            object->copy != VM_OBJECT_NULL) {
2566                /* Honor copy-on-write obligations */
2567
2568                /*
2569                 * The caller is gathering these pages and
2570                 * might modify their contents.  We need to
2571                 * make sure that the copy object has its own
2572                 * private copies of these pages before we let
2573                 * the caller modify them.
2574                 */
2575                vm_object_update(object,
2576                                 offset,
2577                                 size,
2578                                 NULL,
2579                                 NULL,
2580                                 FALSE, /* should_return */
2581                                 MEMORY_OBJECT_COPY_SYNC,
2582                                 VM_PROT_NO_CHANGE);
2583                upl_cow++;
2584                upl_cow_pages += size >> PAGE_SHIFT;
2585                
2586        }
2587        /* remember which copy object we synchronized with */
2588        last_copy_object = object->copy;
2589
2590        entry = 0;
2591        if(cntrl_flags & UPL_COPYOUT_FROM) {
2592                upl->flags |= UPL_PAGE_SYNC_DONE;
2593
2594                while (xfer_size) {
2595                        if((alias_page == NULL) && 
2596                                !(cntrl_flags & UPL_SET_LITE)) {
2597                                vm_object_unlock(object);
2598                                VM_PAGE_GRAB_FICTITIOUS(alias_page);
2599                                vm_object_lock(object);
2600                        }
2601                        if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2602                                dst_page->fictitious ||
2603                                dst_page->absent ||
2604                                dst_page->error ||
2605                               (dst_page->wire_count && !dst_page->pageout) ||
2606
2607                             ((!dst_page->inactive) && (cntrl_flags & UPL_FOR_PAGEOUT) &&
2608                               (dst_page->page_ticket != page_ticket) && 
2609                              ((dst_page->page_ticket+1) != page_ticket)) ) {
2610
2611                                if (user_page_list)
2612                                        user_page_list[entry].phys_addr = 0;
2613                        } else { 
2614                                /*
2615                                 * grab this up front...
2616                                 * a high percentage of the time we're going to
2617                                 * need the hardware modification state a bit later
2618                                 * anyway... so we can eliminate an extra call into
2619                                 * the pmap layer by grabbing it here and recording it
2620                                 */
2621                                refmod_state = pmap_get_refmod(dst_page->phys_page);
2622                                        
2623                                if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2624                                        /*
2625                                         * we're only asking for DIRTY pages to be returned
2626                                         */
2627
2628                                        if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2629                                                /*
2630                                                 * if we were the page stolen by vm_pageout_scan to be
2631                                                 * cleaned (as opposed to a buddy being clustered in)
2632                                                 * or this request is not being driven by a PAGEOUT cluster,
2633                                                 * then we only need to check for the page being dirty or
2634                                                 * precious to decide whether to return it
2635                                                 */
2636                                                if (dst_page->dirty || dst_page->precious ||
2637                                                    (refmod_state & VM_MEM_MODIFIED)) {
2638                                                        goto check_busy;
2639                                                }
2640                                        }
2641                                        /*
2642                                         * this is a request for a PAGEOUT cluster and this page
2643                                         * is merely along for the ride as a 'buddy'... not only
2644                                         * does it have to be dirty to be returned, but it also
2645                                         * can't have been referenced recently... note that we've
2646                                         * already filtered above based on whether this page is
2647                                         * currently on the inactive queue or it meets the page
2648                                         * ticket (generation count) check
2649                                         */
2650                                        if ( !(refmod_state & VM_MEM_REFERENCED) && 
2651                                             ((refmod_state & VM_MEM_MODIFIED) ||
2652                                              dst_page->dirty || dst_page->precious) ) {
2653                                                goto check_busy;
2654                                        }
2655                                        /*
2656                                         * if we reach here, we're not to return
2657                                         * the page... go on to the next one
2658                                         */
2659                                        if (user_page_list)
2660                                                user_page_list[entry].phys_addr = 0;
2661                                        entry++;
2662                                        dst_offset += PAGE_SIZE_64;
2663                                        xfer_size -= PAGE_SIZE;
2664                                        continue;
2665                                }
2666check_busy:                     
2667                                if(dst_page->busy && 
2668                                        (!(dst_page->list_req_pending && 
2669                                                dst_page->pageout))) {
2670                                        if(cntrl_flags & UPL_NOBLOCK) {
2671                                                if(user_page_list) {
2672                                                        user_page_list[entry].phys_addr = 0;
2673                                                }
2674                                                entry++;
2675                                                dst_offset += PAGE_SIZE_64;
2676                                                xfer_size -= PAGE_SIZE;
2677                                                continue;
2678                                        }
2679                                        /*
2680                                         * someone else is playing with the
2681                                         * page.  We will have to wait.
2682                                         */
2683                                        PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2684                                        continue;
2685                                }
2686                                /* Someone else already cleaning the page? */
2687                                if((dst_page->cleaning || dst_page->absent ||
2688                                        dst_page->wire_count != 0) && 
2689                                        !dst_page->list_req_pending) {
2690                                   if(user_page_list) {
2691                                           user_page_list[entry].phys_addr = 0;
2692                                   }
2693                                   entry++;
2694                                   dst_offset += PAGE_SIZE_64;
2695                                   xfer_size -= PAGE_SIZE;
2696                                   continue;
2697                                }
2698                                /* eliminate all mappings from the */
2699                                /* original object and its progeny */
2700                                
2701                                vm_page_lock_queues();
2702
2703                                if (dst_page->pageout_queue == TRUE)
2704                                        /*
2705                                         * we've buddied up a page for a clustered pageout
2706                                         * that has already been moved to the pageout
2707                                         * queue by pageout_scan... we need to remove
2708                                         * it from the queue and drop the laundry count
2709                                         * on that queue
2710                                         */
2711                                        vm_pageout_queue_steal(dst_page);
2712#if MACH_CLUSTER_STATS
2713                                /* pageout statistics gathering.  count  */
2714                                /* all the pages we will page out that   */
2715                                /* were not counted in the initial       */
2716                                /* vm_pageout_scan work                  */
2717                                if(dst_page->list_req_pending)
2718                                        encountered_lrp = TRUE;
2719                                if((dst_page->dirty ||
2720                                        (dst_page->object->internal &&
2721                                        dst_page->precious)) &&
2722                                        (dst_page->list_req_pending 
2723                                        == FALSE)) {
2724                                        if(encountered_lrp) {
2725                                                CLUSTER_STAT
2726                                                (pages_at_higher_offsets++;)
2727                                        } else {
2728                                                CLUSTER_STAT
2729                                                (pages_at_lower_offsets++;)
2730                                        }
2731                                }
2732#endif
2733                                /* Turn off busy indication on pending */
2734                                /* pageout.  Note: we can only get here */
2735                                /* in the request pending case.  */
2736                                dst_page->list_req_pending = FALSE;
2737                                dst_page->busy = FALSE;
2738                                dst_page->cleaning = FALSE;
2739
2740                                hw_dirty = refmod_state & VM_MEM_MODIFIED;
2741                                dirty = hw_dirty ? TRUE : dst_page->dirty;
2742
2743                                if(cntrl_flags & UPL_SET_LITE) {
2744                                        int     pg_num;
2745                                        pg_num = (dst_offset-offset)/PAGE_SIZE;
2746                                        lite_list[pg_num>>5] |= 
2747                                                        1 << (pg_num & 31);
2748                                        if (hw_dirty)
2749                                                pmap_clear_modify(dst_page->phys_page);
2750                                        /*
2751                                         * Record that this page has been 
2752                                         * written out
2753                                         */
2754#if     MACH_PAGEMAP
2755                                        vm_external_state_set(
2756                                                object->existence_map, 
2757                                                dst_page->offset);
2758#endif  /*MACH_PAGEMAP*/
2759
2760                                        /*
2761                                         * Mark original page as cleaning 
2762                                         * in place.
2763                                         */
2764                                        dst_page->cleaning = TRUE;
2765                                        dst_page->dirty = TRUE;
2766                                        dst_page->precious = FALSE;
2767                                } else {
2768                                        /* use pageclean setup, it is more */
2769                                        /* convenient even for the pageout */
2770                                        /* cases here */
2771
2772                                        vm_object_lock(upl->map_object);
2773                                        vm_pageclean_setup(dst_page, 
2774                                                alias_page, upl->map_object, 
2775                                                size - xfer_size);
2776                                        vm_object_unlock(upl->map_object);
2777
2778                                        alias_page->absent = FALSE;
2779                                        alias_page = NULL;
2780                                }
2781                                                
2782                                if(!dirty) {
2783                                        dst_page->dirty = FALSE;
2784                                        dst_page->precious = TRUE;
2785                                }
2786
2787                                if(dst_page->pageout)
2788                                        dst_page->busy = TRUE;
2789
2790                                if ( (cntrl_flags & UPL_ENCRYPT) ) {
2791                                        /*
2792                                         * ENCRYPTED SWAP:
2793                                         * We want to deny access to the target page
2794                                         * because its contents are about to be
2795                                         * encrypted and the user would be very
2796                                         * confused to see encrypted data instead
2797                                         * of their data.
2798                                         */
2799                                        dst_page->busy = TRUE;
2800                                }
2801                                if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2802                                        /*
2803                                         * deny access to the target page
2804                                         * while it is being worked on
2805                                         */
2806                                        if ((!dst_page->pageout) &&
2807                                            (dst_page->wire_count == 0)) {
2808                                                dst_page->busy = TRUE;
2809                                                dst_page->pageout = TRUE;
2810                                                vm_page_wire(dst_page);
2811                                        }
2812                                }
2813
2814                                if(user_page_list) {
2815                                        user_page_list[entry].phys_addr
2816                                                = dst_page->phys_page;
2817                                        user_page_list[entry].dirty =   
2818                                                        dst_page->dirty;
2819                                        user_page_list[entry].pageout =
2820                                                        dst_page->pageout;
2821                                        user_page_list[entry].absent =
2822                                                        dst_page->absent;
2823                                        user_page_list[entry].precious =
2824                                                        dst_page->precious;
2825                                }
2826                                vm_page_unlock_queues();
2827
2828                                /*
2829                                 * ENCRYPTED SWAP:
2830                                 * The caller is gathering this page and might
2831                                 * access its contents later on.  Decrypt the
2832                                 * page before adding it to the UPL, so that
2833                                 * the caller never sees encrypted data.
2834                                 */
2835                                if (! (cntrl_flags & UPL_ENCRYPT) &&
2836                                    dst_page->encrypted) {
2837                                        assert(dst_page->busy);
2838
2839                                        vm_page_decrypt(dst_page, 0);
2840                                        vm_page_decrypt_for_upl_counter++;
2841
2842                                        /*
2843                                         * Retry this page, since anything
2844                                         * could have changed while we were
2845                                         * decrypting.
2846                                         */
2847                                        continue;
2848                                }
2849                        }
2850                        entry++;
2851                        dst_offset += PAGE_SIZE_64;
2852                        xfer_size -= PAGE_SIZE;
2853                }
2854        } else {
2855                while (xfer_size) {
2856                        if((alias_page == NULL) && 
2857                                !(cntrl_flags & UPL_SET_LITE)) {
2858                                vm_object_unlock(object);
2859                                VM_PAGE_GRAB_FICTITIOUS(alias_page);
2860                                vm_object_lock(object);
2861                        }
2862
2863                        if ((cntrl_flags & UPL_WILL_MODIFY) &&
2864                            object->copy != last_copy_object) {
2865                                /* Honor copy-on-write obligations */
2866
2867                                /*
2868                                 * The copy object has changed since we
2869                                 * last synchronized for copy-on-write.
2870                                 * Another copy object might have been
2871                                 * inserted while we released the object's
2872                                 * lock.  Since someone could have seen the
2873                                 * original contents of the remaining pages
2874                                 * through that new object, we have to
2875                                 * synchronize with it again for the remaining
2876                                 * pages only.  The previous pages are "busy"
2877                                 * so they can not be seen through the new
2878                                 * mapping.  The new mapping will see our
2879                                 * upcoming changes for those previous pages,
2880                                 * but that's OK since they couldn't see what
2881                                 * was there before.  It's just a race anyway
2882                                 * and there's no guarantee of consistency or
2883                                 * atomicity.  We just don't want new mappings
2884                                 * to see both the *before* and *after* pages.
2885                                 */
2886                                if (object->copy != VM_OBJECT_NULL) {
2887                                        vm_object_update(
2888                                                object,
2889                                                dst_offset,/* current offset */
2890                                                xfer_size, /* remaining size */
2891                                                NULL,
2892                                                NULL,
2893                                                FALSE,     /* should_return */
2894                                                MEMORY_OBJECT_COPY_SYNC,
2895                                                VM_PROT_NO_CHANGE);
2896                                        upl_cow_again++;
2897                                        upl_cow_again_pages +=
2898                                                xfer_size >> PAGE_SHIFT;
2899                                }
2900                                /* remember the copy object we synced with */
2901                                last_copy_object = object->copy;
2902                        }
2903
2904                        dst_page = vm_page_lookup(object, dst_offset);
2905                        
2906                        if(dst_page != VM_PAGE_NULL) {
2907                                if((cntrl_flags & UPL_RET_ONLY_ABSENT) &&
2908                                        !((dst_page->list_req_pending)
2909                                                && (dst_page->absent))) {
2910                                        /* we are doing extended range */
2911                                        /* requests.  we want to grab  */
2912                                        /* pages around some which are */
2913                                        /* already present.  */
2914                                        if(user_page_list) {
2915                                                user_page_list[entry].phys_addr = 0;
2916                                        }
2917                                        entry++;
2918                                        dst_offset += PAGE_SIZE_64;
2919                                        xfer_size -= PAGE_SIZE;
2920                                        continue;
2921                                }
2922                                if((dst_page->cleaning) && 
2923                                   !(dst_page->list_req_pending)) {
2924                                        /*someone else is writing to the */
2925                                        /* page.  We will have to wait.  */
2926                                        PAGE_SLEEP(object,dst_page,THREAD_UNINT);
2927                                        continue;
2928                                }
2929                                if ((dst_page->fictitious && 
2930                                     dst_page->list_req_pending)) {
2931                                        /* dump the fictitious page */
2932                                        dst_page->list_req_pending = FALSE;
2933                                        dst_page->clustered = FALSE;
2934
2935                                        vm_page_lock_queues();
2936                                        vm_page_free(dst_page);
2937                                        vm_page_unlock_queues();
2938
2939                                        dst_page = NULL;
2940                                } else if ((dst_page->absent && 
2941                                            dst_page->list_req_pending)) {
2942                                        /* the default_pager case */
2943                                        dst_page->list_req_pending = FALSE;
2944                                        dst_page->busy = FALSE;
2945                                }
2946                        }
2947                        if(dst_page == VM_PAGE_NULL) {
2948                                if(object->private) {
2949                                        /* 
2950                                         * This is a nasty wrinkle for users 
2951                                         * of upl who encounter device or 
2952                                         * private memory however, it is 
2953                                         * unavoidable, only a fault can
2954                                         * resolve the actual backing
2955                                         * physical page by asking the
2956                                         * backing device.
2957                                         */
2958                                        if(user_page_list) {
2959                                                user_page_list[entry].phys_addr = 0;
2960                                        }
2961                                        entry++;
2962                                        dst_offset += PAGE_SIZE_64;
2963                                        xfer_size -= PAGE_SIZE;
2964                                        continue;
2965                                }
2966                                /* need to allocate a page */
2967                                dst_page = vm_page_alloc(object, dst_offset);
2968                                if (dst_page == VM_PAGE_NULL) {
2969                                        vm_object_unlock(object);
2970                                        VM_PAGE_WAIT();
2971                                        vm_object_lock(object);
2972                                        continue;
2973                                }
2974                                dst_page->busy = FALSE;
2975#if 0
2976                                if(cntrl_flags & UPL_NO_SYNC) {
2977                                        dst_page->page_lock = 0;
2978                                        dst_page->unlock_request = 0;
2979                                }
2980#endif
2981                                if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
2982                                        /*
2983                                         * if UPL_RET_ONLY_ABSENT was specified,
2984                                         * then we're definitely setting up a
2985                                         * upl for a clustered read/pagein 
2986                                         * operation... mark the pages as clustered
2987                                         * so vm_fault can correctly attribute them
2988                                         * to the 'pagein' bucket the first time
2989                                         * a fault happens on them
2990                                         */
2991                                        dst_page->clustered = TRUE;
2992                                }
2993                                dst_page->absent = TRUE;
2994                                object->absent_count++;
2995                        }
2996#if 1
2997                        if(cntrl_flags & UPL_NO_SYNC) {
2998                                dst_page->page_lock = 0;
2999                                dst_page->unlock_request = 0;
3000                        }
3001#endif /* 1 */
3002
3003                        /*
3004                         * ENCRYPTED SWAP:
3005                         */
3006                        if (cntrl_flags & UPL_ENCRYPT) {
3007                                /*
3008                                 * The page is going to be encrypted when we
3009                                 * get it from the pager, so mark it so.
3010                                 */
3011                                dst_page->encrypted = TRUE;
3012                        } else {
3013                                /*
3014                                 * Otherwise, the page will not contain
3015                                 * encrypted data.
3016                                 */
3017                                dst_page->encrypted = FALSE;
3018                        }
3019
3020                        dst_page->overwriting = TRUE;
3021                        if(dst_page->fictitious) {
3022                                panic("need corner case for fictitious page");
3023                        }
3024                        if(dst_page->page_lock) {
3025                                do_m_lock = TRUE;
3026                        }
3027                        if(upl_ptr) {
3028
3029                                /* eliminate all mappings from the */
3030                                /* original object and its prodigy */
3031                                
3032                                if(dst_page->busy) {
3033                                        /*someone else is playing with the */
3034                                        /* page.  We will have to wait.    */
3035                                        PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3036                                        continue;
3037                                }
3038                                vm_page_lock_queues();
3039
3040                                if( !(cntrl_flags & UPL_FILE_IO))
3041                                        hw_dirty = pmap_disconnect(dst_page->phys_page) & VM_MEM_MODIFIED;
3042                                else
3043                                        hw_dirty = pmap_get_refmod(dst_page->phys_page) & VM_MEM_MODIFIED;
3044                                dirty = hw_dirty ? TRUE : dst_page->dirty;
3045
3046                                if(cntrl_flags & UPL_SET_LITE) {
3047                                        int     pg_num;
3048                                        pg_num = (dst_offset-offset)/PAGE_SIZE;
3049                                        lite_list[pg_num>>5] |= 
3050                                                        1 << (pg_num & 31);
3051                                        if (hw_dirty)
3052                                                pmap_clear_modify(dst_page->phys_page);
3053                                        /*
3054                                         * Record that this page has been 
3055                                         * written out
3056                                         */
3057#if     MACH_PAGEMAP
3058                                        vm_external_state_set(
3059                                                object->existence_map, 
3060                                                dst_page->offset);
3061#endif  /*MACH_PAGEMAP*/
3062
3063                                        /*
3064                                         * Mark original page as cleaning 
3065                                         * in place.
3066                                         */
3067                                        dst_page->cleaning = TRUE;
3068                                        dst_page->dirty = TRUE;
3069                                        dst_page->precious = FALSE;
3070                                } else {
3071                                        /* use pageclean setup, it is more */
3072                                        /* convenient even for the pageout */
3073                                        /* cases here */
3074                                        vm_object_lock(upl->map_object);
3075                                        vm_pageclean_setup(dst_page, 
3076                                                alias_page, upl->map_object, 
3077                                                size - xfer_size);
3078                                        vm_object_unlock(upl->map_object);
3079
3080                                        alias_page->absent = FALSE;
3081                                        alias_page = NULL;
3082                                }
3083
3084                                if(cntrl_flags & UPL_CLEAN_IN_PLACE) {
3085                                        /* clean in place for read implies   */
3086                                        /* that a write will be done on all  */
3087                                        /* the pages that are dirty before   */
3088                                        /* a upl commit is done.  The caller */
3089                                        /* is obligated to preserve the      */
3090                                        /* contents of all pages marked      */
3091                                        /* dirty. */
3092                                        upl->flags |= UPL_CLEAR_DIRTY;
3093                                }
3094
3095                                if(!dirty) {
3096                                        dst_page->dirty = FALSE;
3097                                        dst_page->precious = TRUE;
3098                                }
3099                                                
3100                                if (dst_page->wire_count == 0) {
3101                                   /* deny access to the target page while */
3102                                   /* it is being worked on */
3103                                        dst_page->busy = TRUE;
3104                                } else {
3105                                        vm_page_wire(dst_page);
3106                                }
3107                                if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
3108                                        /*
3109                                         * expect the page not to be used
3110                                         * since it's coming in as part
3111                                         * of a cluster and could be 
3112                                         * speculative... pages that
3113                                         * are 'consumed' will get a
3114                                         * hardware reference
3115                                         */
3116                                        dst_page->reference = FALSE;
3117                                } else {
3118                                        /*
3119                                         * expect the page to be used
3120                                         */
3121                                        dst_page->reference = TRUE;
3122                                }
3123                                dst_page->precious = 
3124                                        (cntrl_flags & UPL_PRECIOUS) 
3125                                                        ? TRUE : FALSE;
3126                                if(user_page_list) {
3127                                        user_page_list[entry].phys_addr
3128                                                = dst_page->phys_page;
3129                                        user_page_list[entry].dirty =
3130                                                        dst_page->dirty;
3131                                        user_page_list[entry].pageout =
3132                                                        dst_page->pageout;
3133                                        user_page_list[entry].absent =
3134                                                        dst_page->absent;
3135                                        user_page_list[entry].precious =
3136                                                        dst_page->precious;
3137                                }
3138                                vm_page_unlock_queues();
3139                        }
3140                        entry++;
3141                        dst_offset += PAGE_SIZE_64;
3142                        xfer_size -= PAGE_SIZE;
3143                }
3144        }
3145
3146        if (upl->flags & UPL_INTERNAL) {
3147                if(page_list_count != NULL)
3148                        *page_list_count = 0;
3149        } else if (*page_list_count > entry) {
3150                if(page_list_count != NULL)
3151                        *page_list_count = entry;
3152        }
3153
3154        if(alias_page != NULL) {
3155                vm_page_lock_queues();
3156                vm_page_free(alias_page);
3157                vm_page_unlock_queues();
3158        }
3159
3160        if(do_m_lock) {
3161           vm_prot_t    access_required;
3162           /* call back all associated pages from other users of the pager */
3163           /* all future updates will be on data which is based on the     */
3164           /* changes we are going to make here. Note: it is assumed that  */
3165           /* we already hold copies of the data so we will not be seeing  */
3166           /* an avalanche of incoming data from the pager */
3167           access_required = (cntrl_flags & UPL_COPYOUT_FROM) 
3168                                        ? VM_PROT_READ : VM_PROT_WRITE;
3169           while (TRUE) {
3170                kern_return_t   rc;
3171
3172                if(!object->pager_ready) {
3173                   wait_result_t wait_result;
3174
3175                   wait_result = vm_object_sleep(object, 
3176                                                VM_OBJECT_EVENT_PAGER_READY,
3177                                                THREAD_UNINT);
3178                   if (wait_result !=  THREAD_AWAKENED) {
3179                        vm_object_unlock(object);
3180                        return KERN_FAILURE;
3181                   }
3182                   continue;
3183                }
3184
3185                vm_object_unlock(object);
3186                rc = memory_object_data_unlock(
3187                        object->pager,
3188                        dst_offset + object->paging_offset,
3189                        size,
3190                        access_required);
3191                if (rc != KERN_SUCCESS && rc != MACH_SEND_INTERRUPTED)
3192                        return KERN_FAILURE;
3193                vm_object_lock(object);
3194
3195                if (rc == KERN_SUCCESS)
3196                        break;
3197           }
3198
3199           /* lets wait on the last page requested */
3200           /* NOTE: we will have to update lock completed routine to signal */
3201           if(dst_page != VM_PAGE_NULL && 
3202                (access_required & dst_page->page_lock) != access_required) {
3203                PAGE_ASSERT_WAIT(dst_page, THREAD_UNINT);
3204                vm_object_unlock(object);
3205                thread_block(THREAD_CONTINUE_NULL);
3206                return KERN_SUCCESS;
3207           }
3208        }
3209
3210        vm_object_unlock(object);
3211        return KERN_SUCCESS;
3212}
3213
3214/* JMM - Backward compatability for now */
3215kern_return_t
3216vm_fault_list_request(                  /* forward */
3217        memory_object_control_t         control,
3218        vm_object_offset_t      offset,
3219        upl_size_t              size,
3220        upl_t                   *upl_ptr,
3221        upl_page_info_t         **user_page_list_ptr,
3222        int                     page_list_count,
3223        int                     cntrl_flags);
3224kern_return_t
3225vm_fault_list_request(
3226        memory_object_control_t         control,
3227        vm_object_offset_t      offset,
3228        upl_size_t              size,
3229        upl_t                   *upl_ptr,
3230        upl_page_info_t         **user_page_list_ptr,
3231        int                     page_list_count,
3232        int                     cntrl_flags)
3233{
3234        int                     local_list_count;
3235        upl_page_info_t         *user_page_list;
3236        kern_return_t           kr;
3237
3238        if (user_page_list_ptr != NULL) {
3239                local_list_count = page_list_count;
3240                user_page_list = *user_page_list_ptr;
3241        } else {
3242                local_list_count = 0;
3243                user_page_list = NULL;
3244        }
3245        kr =  memory_object_upl_request(control,
3246                                offset,
3247                                size,
3248                                upl_ptr,
3249                                user_page_list,
3250                                &local_list_count,
3251                                cntrl_flags);
3252
3253        if(kr != KERN_SUCCESS)
3254                return kr;
3255
3256        if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3257                *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3258        }
3259
3260        return KERN_SUCCESS;
3261}
3262
3263                
3264
3265/*  
3266 *      Routine:        vm_object_super_upl_request
3267 *      Purpose:        
3268 *              Cause the population of a portion of a vm_object
3269 *              in much the same way as memory_object_upl_request.
3270 *              Depending on the nature of the request, the pages
3271 *              returned may be contain valid data or be uninitialized.
3272 *              However, the region may be expanded up to the super
3273 *              cluster size provided.
3274 */
3275
3276__private_extern__ kern_return_t
3277vm_object_super_upl_request(
3278        vm_object_t object,
3279        vm_object_offset_t      offset,
3280        upl_size_t              size,
3281        upl_size_t              super_cluster,
3282        upl_t                   *upl,
3283        upl_page_info_t         *user_page_list,
3284        unsigned int            *page_list_count,
3285        int                     cntrl_flags)
3286{
3287        vm_page_t       target_page;
3288        int             ticket;
3289
3290
3291        if(object->paging_offset > offset)
3292                return KERN_FAILURE;
3293
3294        assert(object->paging_in_progress);
3295        offset = offset - object->paging_offset;
3296
3297        if(cntrl_flags & UPL_FOR_PAGEOUT) {
3298          
3299                vm_object_lock(object);
3300
3301                if((target_page = vm_page_lookup(object, offset))
3302                                                        != VM_PAGE_NULL) {
3303                        ticket = target_page->page_ticket;
3304                        cntrl_flags = cntrl_flags & ~(int)UPL_PAGE_TICKET_MASK;
3305                        cntrl_flags = cntrl_flags | 
3306                                ((ticket << UPL_PAGE_TICKET_SHIFT) 
3307                                                        & UPL_PAGE_TICKET_MASK);
3308                }
3309                vm_object_unlock(object);
3310        }
3311
3312        if (super_cluster > size) {
3313
3314                vm_object_offset_t      base_offset;
3315                upl_size_t              super_size;
3316
3317                base_offset = (offset &  
3318                        ~((vm_object_offset_t) super_cluster - 1));
3319                super_size = (offset+size) > (base_offset + super_cluster) ?
3320                                super_cluster<<1 : super_cluster;
3321                super_size = ((base_offset + super_size) > object->size) ? 
3322                                (object->size - base_offset) : super_size;
3323                if(offset > (base_offset + super_size))
3324                   panic("vm_object_super_upl_request: Missed target pageout"
3325                         " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3326                         offset, base_offset, super_size, super_cluster,
3327                         size, object->paging_offset);
3328                /*
3329                 * apparently there is a case where the vm requests a
3330                 * page to be written out who's offset is beyond the
3331                 * object size
3332                 */
3333                if((offset + size) > (base_offset + super_size))
3334                   super_size = (offset + size) - base_offset;
3335
3336                offset = base_offset;
3337                size = super_size;
3338        }
3339        return vm_object_upl_request(object, offset, size,
3340                                     upl, user_page_list, page_list_count,
3341                                     cntrl_flags);
3342}
3343
3344                                 
3345kern_return_t
3346vm_map_create_upl(
3347        vm_map_t                map,
3348        vm_map_address_t        offset,
3349        upl_size_t              *upl_size,
3350        upl_t                   *upl,
3351        upl_page_info_array_t   page_list,
3352        unsigned int            *count,
3353        int                     *flags)
3354{
3355        vm_map_entry_t  entry;
3356        int             caller_flags;
3357        int             force_data_sync;
3358        int             sync_cow_data;
3359        vm_object_t     local_object;
3360        vm_map_offset_t local_offset;
3361        vm_map_offset_t local_start;
3362        kern_return_t   ret;
3363
3364        caller_flags = *flags;
3365
3366        if (caller_flags & ~UPL_VALID_FLAGS) {
3367                /*
3368                 * For forward compatibility's sake,
3369                 * reject any unknown flag.
3370                 */
3371                return KERN_INVALID_VALUE;
3372        }
3373
3374        force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3375        sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3376
3377        if(upl == NULL)
3378                return KERN_INVALID_ARGUMENT;
3379
3380
3381REDISCOVER_ENTRY:
3382        vm_map_lock(map);
3383        if (vm_map_lookup_entry(map, offset, &entry)) {
3384                if (entry->object.vm_object == VM_OBJECT_NULL ||
3385                        !entry->object.vm_object->phys_contiguous) {
3386                        if((*upl_size/page_size) > MAX_UPL_TRANSFER) {
3387                                *upl_size = MAX_UPL_TRANSFER * page_size;
3388                        }
3389                }
3390                if((entry->vme_end - offset) < *upl_size) {
3391                        *upl_size = entry->vme_end - offset;
3392                }
3393                if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3394                        if (entry->object.vm_object == VM_OBJECT_NULL) {
3395                                *flags = 0;
3396                        } else if (entry->object.vm_object->private) {
3397                                *flags = UPL_DEV_MEMORY;
3398                                if (entry->object.vm_object->phys_contiguous) {
3399                                        *flags |= UPL_PHYS_CONTIG;
3400                                }
3401                        } else  {
3402                                *flags = 0;
3403                        }
3404                        vm_map_unlock(map);
3405                        return KERN_SUCCESS;
3406                }
3407                /*
3408                 *      Create an object if necessary.
3409                 */
3410                if (entry->object.vm_object == VM_OBJECT_NULL) {
3411                        entry->object.vm_object = vm_object_allocate(
3412                                (vm_size_t)(entry->vme_end - entry->vme_start));
3413                        entry->offset = 0;
3414                }
3415                if (!(caller_flags & UPL_COPYOUT_FROM)) {
3416                        if (!(entry->protection & VM_PROT_WRITE)) {
3417                                vm_map_unlock(map);
3418                                return KERN_PROTECTION_FAILURE;
3419                        }
3420                        if (entry->needs_copy)  {
3421                                vm_map_t                local_map;
3422                                vm_object_t             object;
3423                                vm_map_offset_t         offset_hi;
3424                                vm_map_offset_t         offset_lo;
3425                                vm_object_offset_t      new_offset;
3426                                vm_prot_t               prot;
3427                                boolean_t               wired;
3428                                vm_behavior_t           behavior;
3429                                vm_map_version_t        version;
3430                                vm_map_t                real_map;
3431
3432                                local_map = map;
3433                                vm_map_lock_write_to_read(map);
3434                                if(vm_map_lookup_locked(&local_map,
3435                                        offset, VM_PROT_WRITE,
3436                                        &version, &object,
3437                                        &new_offset, &prot, &wired,
3438                                        &behavior, &offset_lo,
3439                                        &offset_hi, &real_map)) {
3440                                        vm_map_unlock(local_map);
3441                                        return KERN_FAILURE;
3442                                }
3443                                if (real_map != map) {
3444                                        vm_map_unlock(real_map);
3445                                }
3446                                vm_object_unlock(object);
3447                                vm_map_unlock(local_map);
3448
3449                                goto REDISCOVER_ENTRY;
3450                        }
3451                }
3452                if (entry->is_sub_map) {
3453                        vm_map_t        submap;
3454
3455                        submap = entry->object.sub_map;
3456                        local_start = entry->vme_start;
3457                        local_offset = entry->offset;
3458                        vm_map_reference(submap);
3459                        vm_map_unlock(map);
3460
3461                        ret = (vm_map_create_upl(submap, 
3462                                local_offset + (offset - local_start), 
3463                                upl_size, upl, page_list, count, 
3464                                flags));
3465
3466                        vm_map_deallocate(submap);
3467                        return ret;
3468                }
3469                                        
3470                if (sync_cow_data) {
3471                        if (entry->object.vm_object->shadow
3472                                    || entry->object.vm_object->copy) {
3473
3474                                local_object = entry->object.vm_object;
3475                                local_start = entry->vme_start;
3476                                local_offset = entry->offset;
3477                                vm_object_reference(local_object);
3478                                vm_map_unlock(map);
3479
3480                                if (entry->object.vm_object->shadow && 
3481                                           entry->object.vm_object->copy) {
3482                                   vm_object_lock_request(
3483                                        local_object->shadow,
3484                                        (vm_object_offset_t)
3485                                        ((offset - local_start) +
3486                                         local_offset) +
3487                                        local_object->shadow_offset,
3488                                        *upl_size, FALSE, 
3489                                        MEMORY_OBJECT_DATA_SYNC,
3490                                        VM_PROT_NO_CHANGE);
3491                                }
3492                                sync_cow_data = FALSE;
3493                                vm_object_deallocate(local_object);
3494                                goto REDISCOVER_ENTRY;
3495                        }
3496                }
3497
3498                if (force_data_sync) {
3499
3500                        local_object = entry->object.vm_object;
3501                        local_start = entry->vme_start;
3502                        local_offset = entry->offset;
3503                        vm_object_reference(local_object);
3504                        vm_map_unlock(map);
3505
3506                        vm_object_lock_request(
3507                                   local_object,
3508                                   (vm_object_offset_t)
3509                                   ((offset - local_start) + local_offset),
3510                                   (vm_object_size_t)*upl_size, FALSE, 
3511                                   MEMORY_OBJECT_DATA_SYNC,
3512                                   VM_PROT_NO_CHANGE);
3513                        force_data_sync = FALSE;
3514                        vm_object_deallocate(local_object);
3515                        goto REDISCOVER_ENTRY;
3516                }
3517
3518                if(!(entry->object.vm_object->private)) {
3519                        if(*upl_size > (MAX_UPL_TRANSFER*PAGE_SIZE))
3520                                *upl_size = (MAX_UPL_TRANSFER*PAGE_SIZE);
3521                        if(entry->object.vm_object->phys_contiguous) {
3522                                *flags = UPL_PHYS_CONTIG;
3523                        } else {
3524                                *flags = 0;
3525                        }
3526                } else {
3527                        *flags = UPL_DEV_MEMORY | UPL_PHYS_CONTIG;
3528                }
3529                local_object = entry->object.vm_object;
3530                local_offset = entry->offset;
3531                local_start = entry->vme_start;
3532                vm_object_reference(local_object);
3533                vm_map_unlock(map);
3534                if(caller_flags & UPL_SET_IO_WIRE) {
3535                        ret = (vm_object_iopl_request(local_object, 
3536                                (vm_object_offset_t)
3537                                   ((offset - local_start) 
3538                                                + local_offset),
3539                                *upl_size,
3540                                upl,
3541                                page_list,
3542                                count,
3543                                caller_flags));
3544                } else {
3545                        ret = (vm_object_upl_request(local_object, 
3546                                (vm_object_offset_t)
3547                                   ((offset - local_start) 
3548                                                + local_offset),
3549                                *upl_size,
3550                                upl,
3551                                page_list,
3552                                count,
3553                                caller_flags));
3554                }
3555                vm_object_deallocate(local_object);
3556                return(ret);
3557        } 
3558
3559        vm_map_unlock(map);
3560        return(KERN_FAILURE);
3561
3562}
3563
3564/*
3565 * Internal routine to enter a UPL into a VM map.
3566 * 
3567 * JMM - This should just be doable through the standard
3568 * vm_map_enter() API.
3569 */
3570kern_return_t
3571vm_map_enter_upl(
3572        vm_map_t                map, 
3573        upl_t                   upl, 
3574        vm_map_offset_t *dst_addr)
3575{
3576        vm_map_size_t           size;
3577        vm_object_offset_t      offset;
3578        vm_map_offset_t         addr;
3579        vm_page_t               m;
3580        kern_return_t           kr;
3581
3582        if (upl == UPL_NULL)
3583                return KERN_INVALID_ARGUMENT;
3584
3585        upl_lock(upl);
3586
3587        /* check to see if already mapped */
3588        if(UPL_PAGE_LIST_MAPPED & upl->flags) {
3589                upl_unlock(upl);
3590                return KERN_FAILURE;
3591        }
3592
3593        if((!(upl->map_object->pageout)) &&     
3594                !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3595                                        (upl->map_object->phys_contiguous))) {
3596                vm_object_t             object;
3597                vm_page_t               alias_page;
3598                vm_object_offset_t      new_offset;
3599                int                     pg_num;
3600                wpl_array_t             lite_list;
3601
3602                if(upl->flags & UPL_INTERNAL) {
3603                        lite_list = (wpl_array_t) 
3604                                ((((uintptr_t)upl) + sizeof(struct upl))
3605                                + ((upl->size/PAGE_SIZE) 
3606                                                * sizeof(upl_page_info_t)));
3607                } else {
3608                        lite_list = (wpl_array_t)
3609                                (((uintptr_t)upl) + sizeof(struct upl));
3610                }
3611                object = upl->map_object;
3612                upl->map_object = vm_object_allocate(upl->size);
3613                vm_object_lock(upl->map_object);
3614                upl->map_object->shadow = object;
3615                upl->map_object->pageout = TRUE;
3616                upl->map_object->can_persist = FALSE;
3617                upl->map_object->copy_strategy = 
3618                                MEMORY_OBJECT_COPY_NONE;
3619                upl->map_object->shadow_offset = 
3620                                upl->offset - object->paging_offset;
3621                upl->map_object->wimg_bits = object->wimg_bits;
3622                offset = upl->map_object->shadow_offset;
3623                new_offset = 0;
3624                size = upl->size;
3625
3626                vm_object_lock(object);
3627
3628                while(size) {
3629                   pg_num = (new_offset)/PAGE_SIZE;
3630                   if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3631                        vm_object_unlock(object);
3632                        VM_PAGE_GRAB_FICTITIOUS(alias_page);
3633                        vm_object_lock(object);
3634                        m = vm_page_lookup(object, offset);
3635                        if (m == VM_PAGE_NULL) {
3636                                panic("vm_upl_map: page missing\n");
3637                        }
3638
3639                        vm_object_paging_begin(object);
3640
3641                        /*
3642                        * Convert the fictitious page to a private 
3643                         * shadow of the real page.
3644                         */
3645                        assert(alias_page->fictitious);
3646                        alias_page->fictitious = FALSE;
3647                        alias_page->private = TRUE;
3648                        alias_page->pageout = TRUE;
3649                        alias_page->phys_page = m->phys_page;
3650
3651                        vm_page_lock_queues();
3652                        vm_page_wire(alias_page);
3653                        vm_page_unlock_queues();
3654
3655                        /*
3656                         * ENCRYPTED SWAP:
3657                         * The virtual page ("m") has to be wired in some way
3658                         * here or its physical page ("m->phys_page") could
3659                         * be recycled at any time.
3660                         * Assuming this is enforced by the caller, we can't
3661                         * get an encrypted page here.  Since the encryption
3662                         * key depends on the VM page's "pager" object and
3663                         * the "paging_offset", we couldn't handle 2 pageable
3664                         * VM pages (with different pagers and paging_offsets)
3665                         * sharing the same physical page:  we could end up
3666                         * encrypting with one key (via one VM page) and
3667                         * decrypting with another key (via the alias VM page).
3668                         */
3669                        ASSERT_PAGE_DECRYPTED(m);
3670
3671                        vm_page_insert(alias_page, 
3672                                        upl->map_object, new_offset);
3673                        assert(!alias_page->wanted);
3674                        alias_page->busy = FALSE;
3675                        alias_page->absent = FALSE;
3676                   }
3677
3678                   size -= PAGE_SIZE;
3679                   offset += PAGE_SIZE_64;
3680                   new_offset += PAGE_SIZE_64;
3681                }
3682                vm_object_unlock(object);
3683                vm_object_unlock(upl->map_object);
3684        }
3685        if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3686                offset = upl->offset - upl->map_object->paging_offset;
3687        else
3688                offset = 0;
3689
3690        size = upl->size;
3691        
3692        vm_object_lock(upl->map_object);
3693        upl->map_object->ref_count++;
3694        vm_object_res_reference(upl->map_object);
3695        vm_object_unlock(upl->map_object);
3696
3697        *dst_addr = 0;
3698
3699
3700        /* NEED A UPL_MAP ALIAS */
3701        kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3702                VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3703                VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3704
3705        if (kr != KERN_SUCCESS) {
3706                upl_unlock(upl);
3707                return(kr);
3708        }
3709
3710        vm_object_lock(upl->map_object);
3711
3712        for(addr=*dst_addr; size > 0; size-=PAGE_SIZE,addr+=PAGE_SIZE) {
3713                m = vm_page_lookup(upl->map_object, offset);
3714                if(m) {
3715                   unsigned int cache_attr;
3716                   cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3717        
3718                   PMAP_ENTER(map->pmap, addr,
3719                                m, VM_PROT_ALL, 
3720                                cache_attr, TRUE);
3721                }
3722                offset+=PAGE_SIZE_64;
3723        }
3724        vm_object_unlock(upl->map_object);
3725
3726        upl->ref_count++;  /* hold a reference for the mapping */
3727        upl->flags |= UPL_PAGE_LIST_MAPPED;
3728        upl->kaddr = *dst_addr;
3729        upl_unlock(upl);
3730        return KERN_SUCCESS;
3731}
3732        
3733/*
3734 * Internal routine to remove a UPL mapping from a VM map.
3735 *
3736 * XXX - This should just be doable through a standard
3737 * vm_map_remove() operation.  Otherwise, implicit clean-up
3738 * of the target map won't be able to correctly remove
3739 * these (and release the reference on the UPL).  Having
3740 * to do this means we can't map these into user-space
3741 * maps yet.
3742 */
3743kern_return_t
3744vm_map_remove_upl(
3745        vm_map_t        map, 
3746        upl_t           upl)
3747{
3748        vm_address_t    addr;
3749        upl_size_t      size;
3750
3751        if (upl == UPL_NULL)
3752                return KERN_INVALID_ARGUMENT;
3753
3754        upl_lock(upl);
3755        if(upl->flags & UPL_PAGE_LIST_MAPPED) {
3756                addr = upl->kaddr;
3757                size = upl->size;
3758                assert(upl->ref_count > 1);
3759                upl->ref_count--;               /* removing mapping ref */
3760                upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3761                upl->kaddr = (vm_offset_t) 0;
3762                upl_unlock(upl);
3763
3764                vm_map_remove(  map,
3765                                vm_map_trunc_page(addr),
3766                                vm_map_round_page(addr + size),
3767                                VM_MAP_NO_FLAGS);
3768                return KERN_SUCCESS;
3769        }
3770        upl_unlock(upl);
3771        return KERN_FAILURE;
3772}
3773
3774kern_return_t
3775upl_commit_range(
3776        upl_t                   upl, 
3777        upl_offset_t            offset, 
3778        upl_size_t              size,
3779        int                     flags,
3780        upl_page_info_t         *page_list,
3781        mach_msg_type_number_t  count,
3782        boolean_t               *empty) 
3783{
3784        upl_size_t              xfer_size = size;
3785        vm_object_t             shadow_object;
3786        vm_object_t             object = upl->map_object;
3787        vm_object_offset_t      target_offset;
3788        int                     entry;
3789        wpl_array_t             lite_list;
3790        int                     occupied;
3791        int                     delayed_unlock = 0;
3792        int                     clear_refmod = 0;
3793        boolean_t               shadow_internal;
3794
3795        *empty = FALSE;
3796
3797        if (upl == UPL_NULL)
3798                return KERN_INVALID_ARGUMENT;
3799
3800
3801        if (count == 0)
3802                page_list = NULL;
3803
3804        if (object->pageout) {
3805                shadow_object = object->shadow;
3806        } else {
3807                shadow_object = object;
3808        }
3809
3810        upl_lock(upl);
3811
3812        if (upl->flags & UPL_ACCESS_BLOCKED) {
3813                /*
3814                 * We used this UPL to block access to the pages by marking
3815                 * them "busy".  Now we need to clear the "busy" bit to allow
3816                 * access to these pages again.
3817                 */
3818                flags |= UPL_COMMIT_ALLOW_ACCESS;
3819        }
3820
3821        if (upl->flags & UPL_CLEAR_DIRTY)
3822                flags |= UPL_COMMIT_CLEAR_DIRTY;
3823
3824        if (upl->flags & UPL_DEVICE_MEMORY) {
3825                xfer_size = 0;
3826        } else if ((offset + size) > upl->size) {
3827                upl_unlock(upl);
3828                return KERN_FAILURE;
3829        }
3830
3831        if (upl->flags & UPL_INTERNAL) {
3832                lite_list = (wpl_array_t) 
3833                        ((((uintptr_t)upl) + sizeof(struct upl))
3834                        + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3835        } else {
3836                lite_list = (wpl_array_t)
3837                        (((uintptr_t)upl) + sizeof(struct upl));
3838        }
3839        if (object != shadow_object)
3840                vm_object_lock(object);
3841        vm_object_lock(shadow_object);
3842
3843        shadow_internal = shadow_object->internal;
3844
3845        entry = offset/PAGE_SIZE;
3846        target_offset = (vm_object_offset_t)offset;
3847
3848        while (xfer_size) {
3849                vm_page_t       t,m;
3850                upl_page_info_t *p;
3851
3852                m = VM_PAGE_NULL;
3853
3854                if (upl->flags & UPL_LITE) {
3855                        int     pg_num;
3856
3857                        pg_num = target_offset/PAGE_SIZE;
3858
3859                        if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3860                                lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
3861                                m = vm_page_lookup(shadow_object,
3862                                                   target_offset + (upl->offset - 
3863                                                                    shadow_object->paging_offset));
3864                        }
3865                }
3866                if (object->pageout) {
3867                        if ((t = vm_page_lookup(object, target_offset)) != NULL) {
3868                                t->pageout = FALSE;
3869
3870                                if (delayed_unlock) {
3871                                        delayed_unlock = 0;
3872                                        vm_page_unlock_queues();
3873                                }
3874                                VM_PAGE_FREE(t);
3875
3876                                if (m == NULL) {
3877                                        m = vm_page_lookup(
3878                                            shadow_object, 
3879                                            target_offset + 
3880                                                object->shadow_offset);
3881                                }
3882                                if (m != VM_PAGE_NULL)
3883                                        vm_object_paging_end(m->object);
3884                        }
3885                }
3886                if (m != VM_PAGE_NULL) {
3887
3888                   clear_refmod = 0;
3889
3890                   if (upl->flags & UPL_IO_WIRE) {
3891
3892                        if (delayed_unlock == 0)
3893                                vm_page_lock_queues();
3894
3895                        vm_page_unwire(m);
3896
3897                        if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
3898                                delayed_unlock = 0;
3899                                vm_page_unlock_queues();
3900                        }
3901                        if (page_list) {
3902                                page_list[entry].phys_addr = 0;
3903                        }
3904                        if (flags & UPL_COMMIT_SET_DIRTY) {
3905                                m->dirty = TRUE;
3906                        } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3907                                m->dirty = FALSE;
3908                                clear_refmod |= VM_MEM_MODIFIED;
3909                        }
3910                        if (flags & UPL_COMMIT_INACTIVATE) {
3911                                m->reference = FALSE;
3912                                clear_refmod |= VM_MEM_REFERENCED;
3913                                vm_page_deactivate(m);
3914                        }
3915                        if (clear_refmod)
3916                                pmap_clear_refmod(m->phys_page, clear_refmod);
3917
3918                        if (flags & UPL_COMMIT_ALLOW_ACCESS) {
3919                                /*
3920                                 * We blocked access to the pages in this UPL.
3921                                 * Clear the "busy" bit and wake up any waiter
3922                                 * for this page.
3923                                 */
3924                                PAGE_WAKEUP_DONE(m);
3925                        }
3926
3927                        target_offset += PAGE_SIZE_64;
3928                        xfer_size -= PAGE_SIZE;
3929                        entry++;
3930                        continue;
3931                   }
3932                   if (delayed_unlock == 0)
3933                        vm_page_lock_queues();
3934                   /*
3935                    * make sure to clear the hardware
3936                    * modify or reference bits before
3937                    * releasing the BUSY bit on this page
3938                    * otherwise we risk losing a legitimate
3939                    * change of state
3940                    */
3941                   if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3942                        m->dirty = FALSE;
3943                        clear_refmod |= VM_MEM_MODIFIED;
3944                   }
3945                   if (flags & UPL_COMMIT_INACTIVATE)
3946                        clear_refmod |= VM_MEM_REFERENCED;
3947
3948                   if (clear_refmod)
3949                        pmap_clear_refmod(m->phys_page, clear_refmod);
3950
3951                   if (page_list) {
3952                        p = &(page_list[entry]);
3953                        if(p->phys_addr && p->pageout && !m->pageout) {
3954                                m->busy = TRUE;
3955                                m->pageout = TRUE;
3956                                vm_page_wire(m);
3957                        } else if (page_list[entry].phys_addr &&
3958                                        !p->pageout && m->pageout &&
3959                                        !m->dump_cleaning) {
3960                                m->pageout = FALSE;
3961                                m->absent = FALSE;
3962                                m->overwriting = FALSE;
3963                                vm_page_unwire(m);
3964                                PAGE_WAKEUP_DONE(m);
3965                        }
3966                        page_list[entry].phys_addr = 0;
3967                   }
3968                   m->dump_cleaning = FALSE;
3969                   if(m->laundry) {
3970                           vm_pageout_throttle_up(m);
3971                   }
3972                   if(m->pageout) {
3973                      m->cleaning = FALSE;
3974                      m->pageout = FALSE;
3975#if MACH_CLUSTER_STATS
3976                      if (m->wanted) vm_pageout_target_collisions++;
3977#endif
3978                      if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
3979                              m->dirty = TRUE;
3980                      else
3981                              m->dirty = FALSE;
3982
3983                      if(m->dirty) {
3984                              vm_page_unwire(m);/* reactivates */
3985
3986                              if (upl->flags & UPL_PAGEOUT) {
3987                                      CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
3988                                      VM_STAT(reactivations++);
3989                              }
3990                              PAGE_WAKEUP_DONE(m);
3991                      } else {
3992                            vm_page_free(m);/* clears busy, etc. */
3993 
3994                            if (upl->flags & UPL_PAGEOUT) {
3995                                    CLUSTER_STAT(vm_pageout_target_page_freed++;)
3996
3997                                    if (page_list[entry].dirty)
3998                                            VM_STAT(pageouts++);
3999                            }
4000                      }
4001                      if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4002                            delayed_unlock = 0;
4003                            vm_page_unlock_queues();
4004                      }
4005                      target_offset += PAGE_SIZE_64;
4006                      xfer_size -= PAGE_SIZE;
4007                      entry++;
4008                      continue;
4009                   }
4010#if MACH_CLUSTER_STATS
4011                   m->dirty = pmap_is_modified(m->phys_page);
4012
4013                   if (m->dirty)   vm_pageout_cluster_dirtied++;
4014                   else            vm_pageout_cluster_cleaned++;
4015                   if (m->wanted)  vm_pageout_cluster_collisions++;
4016#else
4017                   m->dirty = 0;
4018#endif
4019
4020                   if((m->busy) && (m->cleaning)) {
4021                        /* the request_page_list case */
4022                        if(m->absent) {
4023                                m->absent = FALSE;
4024                                if(shadow_object->absent_count == 1)
4025                                      vm_object_absent_release(shadow_object);
4026                                else
4027                                      shadow_object->absent_count--;
4028                        }
4029                        m->overwriting = FALSE;
4030                        m->busy = FALSE;
4031                        m->dirty = FALSE;
4032                   } else if (m->overwriting) {
4033                         /* alternate request page list, write to 
4034                          * page_list case.  Occurs when the original
4035                          * page was wired at the time of the list
4036                          * request */
4037                         assert(m->wire_count != 0);
4038                         vm_page_unwire(m);/* reactivates */
4039                         m->overwriting = FALSE;
4040                   }
4041                   m->cleaning = FALSE;
4042
4043                   /* It is a part of the semantic of COPYOUT_FROM */
4044                   /* UPLs that a commit implies cache sync           */
4045                   /* between the vm page and the backing store    */
4046                   /* this can be used to strip the precious bit   */
4047                   /* as well as clean */
4048                   if (upl->flags & UPL_PAGE_SYNC_DONE)
4049                         m->precious = FALSE;
4050
4051                   if (flags & UPL_COMMIT_SET_DIRTY)
4052                        m->dirty = TRUE;
4053
4054                   if (flags & UPL_COMMIT_INACTIVATE) {
4055                        m->reference = FALSE;
4056                        vm_page_deactivate(m);
4057                   } else if (!m->active && !m->inactive) {
4058                        if (m->reference)
4059                                vm_page_activate(m);
4060                        else
4061                                vm_page_deactivate(m);
4062                   }
4063
4064                   if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4065                           /*
4066                            * We blocked access to the pages in this URL.
4067                            * Clear the "busy" bit on this page before we
4068                            * wake up any waiter.
4069                            */
4070                           m->busy = FALSE;
4071                   }
4072
4073                   /*
4074                    * Wakeup any thread waiting for the page to be un-cleaning.
4075                    */
4076                   PAGE_WAKEUP(m);
4077
4078                   if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4079                         delayed_unlock = 0;
4080                         vm_page_unlock_queues();
4081                   }
4082                }
4083                target_offset += PAGE_SIZE_64;
4084                xfer_size -= PAGE_SIZE;
4085                entry++;
4086        }
4087        if (delayed_unlock)
4088                vm_page_unlock_queues();
4089
4090        occupied = 1;
4091
4092        if (upl->flags & UPL_DEVICE_MEMORY)  {
4093                occupied = 0;
4094        } else if (upl->flags & UPL_LITE) {
4095                int     pg_num;
4096                int     i;
4097                pg_num = upl->size/PAGE_SIZE;
4098                pg_num = (pg_num + 31) >> 5;
4099                occupied = 0;
4100                for(i= 0; i<pg_num; i++) {
4101                        if(lite_list[i] != 0) {
4102                                occupied = 1;
4103                                break;
4104                        }
4105                }
4106        } else {
4107                if(queue_empty(&upl->map_object->memq)) {
4108                        occupied = 0;
4109                }
4110        }
4111
4112        if(occupied == 0) {
4113                if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4114                        *empty = TRUE;
4115                }
4116                if(object == shadow_object)
4117                        vm_object_paging_end(shadow_object);
4118        }
4119        vm_object_unlock(shadow_object);
4120        if (object != shadow_object)
4121                vm_object_unlock(object);
4122        upl_unlock(upl);
4123
4124        return KERN_SUCCESS;
4125}
4126
4127kern_return_t
4128upl_abort_range(
4129        upl_t                   upl, 
4130        upl_offset_t            offset, 
4131        upl_size_t              size,
4132        int                     error,
4133        boolean_t               *empty) 
4134{
4135        upl_size_t              xfer_size = size;
4136        vm_object_t             shadow_object;
4137        vm_object_t             object = upl->map_object;
4138        vm_object_offset_t      target_offset;
4139        int                     entry;
4140        wpl_array_t             lite_list;
4141        int                     occupied;
4142        boolean_t               shadow_internal;
4143
4144        *empty = FALSE;
4145
4146        if (upl == UPL_NULL)
4147                return KERN_INVALID_ARGUMENT;
4148
4149        if (upl->flags & UPL_IO_WIRE) {
4150                return upl_commit_range(upl, 
4151                        offset, size, 0, 
4152                        NULL, 0, empty);
4153        }
4154
4155        if(object->pageout) {
4156                shadow_object = object->shadow;
4157        } else {
4158                shadow_object = object;
4159        }
4160
4161        upl_lock(upl);
4162        if(upl->flags & UPL_DEVICE_MEMORY) {
4163                xfer_size = 0;
4164        } else if ((offset + size) > upl->size) {
4165                upl_unlock(upl);
4166                return KERN_FAILURE;
4167        }
4168        if (object != shadow_object)
4169                vm_object_lock(object);
4170        vm_object_lock(shadow_object);
4171
4172        shadow_internal = shadow_object->internal;
4173
4174        if(upl->flags & UPL_INTERNAL) {
4175                lite_list = (wpl_array_t) 
4176                        ((((uintptr_t)upl) + sizeof(struct upl))
4177                        + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4178        } else {
4179                lite_list = (wpl_array_t) 
4180                        (((uintptr_t)upl) + sizeof(struct upl));
4181        }
4182
4183        entry = offset/PAGE_SIZE;
4184        target_offset = (vm_object_offset_t)offset;
4185        while(xfer_size) {
4186                vm_page_t       t,m;
4187
4188                m = VM_PAGE_NULL;
4189                if(upl->flags & UPL_LITE) {
4190                        int     pg_num;
4191                        pg_num = target_offset/PAGE_SIZE;
4192                        if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4193                                lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4194                                m = vm_page_lookup(shadow_object,
4195                                        target_offset + (upl->offset - 
4196                                                shadow_object->paging_offset));
4197                        }
4198                }
4199                if(object->pageout) {
4200                        if ((t = vm_page_lookup(object, target_offset))
4201                                                                != NULL) {
4202                                t->pageout = FALSE;
4203                                VM_PAGE_FREE(t);
4204                                if(m == NULL) {
4205                                        m = vm_page_lookup(
4206                                            shadow_object, 
4207                                            target_offset + 
4208                                                object->shadow_offset);
4209                                }
4210                                if(m != VM_PAGE_NULL)
4211                                        vm_object_paging_end(m->object);
4212                        }
4213                }
4214                if(m != VM_PAGE_NULL) {
4215                        vm_page_lock_queues();
4216                        if(m->absent) {
4217                                boolean_t must_free = TRUE;
4218
4219                                /* COPYOUT = FALSE case */
4220                                /* check for error conditions which must */
4221                                /* be passed back to the pages customer  */
4222                                if(error & UPL_ABORT_RESTART) {
4223                                        m->restart = TRUE;
4224                                        m->absent = FALSE;
4225                                        vm_object_absent_release(m->object);
4226                                        m->page_error = KERN_MEMORY_ERROR;
4227                                        m->error = TRUE;
4228                                        must_free = FALSE;
4229                                } else if(error & UPL_ABORT_UNAVAILABLE) {
4230                                        m->restart = FALSE;
4231                                        m->unusual = TRUE;
4232                                        must_free = FALSE;
4233                                } else if(error & UPL_ABORT_ERROR) {
4234                                        m->restart = FALSE;
4235                                        m->absent = FALSE;
4236                                        vm_object_absent_release(m->object);
4237                                        m->page_error = KERN_MEMORY_ERROR;
4238                                        m->error = TRUE;
4239                                        must_free = FALSE;
4240                                }
4241
4242                                /*
4243                                 * ENCRYPTED SWAP:
4244                                 * If the page was already encrypted,
4245                                 * we don't really need to decrypt it
4246                                 * now.  It will get decrypted later,
4247                                 * on demand, as soon as someone needs
4248                                 * to access its contents.
4249                                 */
4250
4251                                m->cleaning = FALSE;
4252                                m->overwriting = FALSE;
4253                                PAGE_WAKEUP_DONE(m);
4254
4255                                if (must_free == TRUE) {
4256                                        vm_page_free(m);
4257                                } else {
4258                                        vm_page_activate(m);
4259                                }
4260                                vm_page_unlock_queues();
4261
4262                                target_offset += PAGE_SIZE_64;
4263                                xfer_size -= PAGE_SIZE;
4264                                entry++;
4265                                continue;
4266                        }
4267                        /*                          
4268                        * Handle the trusted pager throttle.
4269                        */                     
4270                        if (m->laundry) {
4271                                vm_pageout_throttle_up(m);
4272                        }         
4273                        if(m->pageout) {
4274                                assert(m->busy);
4275                                assert(m->wire_count == 1);
4276                                m->pageout = FALSE;
4277                                vm_page_unwire(m);
4278                        }
4279                        m->dump_cleaning = FALSE;
4280                        m->cleaning = FALSE;
4281                        m->overwriting = FALSE;
4282#if     MACH_PAGEMAP
4283                        vm_external_state_clr(
4284                                m->object->existence_map, m->offset);
4285#endif  /* MACH_PAGEMAP */
4286                        if(error & UPL_ABORT_DUMP_PAGES) {
4287                                vm_page_free(m);
4288                                pmap_disconnect(m->phys_page);
4289                        } else {
4290                                PAGE_WAKEUP_DONE(m);
4291                        }
4292                        vm_page_unlock_queues();
4293                }
4294                target_offset += PAGE_SIZE_64;
4295                xfer_size -= PAGE_SIZE;
4296                entry++;
4297        }
4298        occupied = 1;
4299        if (upl->flags & UPL_DEVICE_MEMORY)  {
4300                occupied = 0;
4301        } else if (upl->flags & UPL_LITE) {
4302                int     pg_num;
4303                int     i;
4304                pg_num = upl->size/PAGE_SIZE;
4305                pg_num = (pg_num + 31) >> 5;
4306                occupied = 0;
4307                for(i= 0; i<pg_num; i++) {
4308                        if(lite_list[i] != 0) {
4309                                occupied = 1;
4310                                break;
4311                        }
4312                }
4313        } else {
4314                if(queue_empty(&upl->map_object->memq)) {
4315                        occupied = 0;
4316                }
4317        }
4318
4319        if(occupied == 0) {
4320                if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4321                        *empty = TRUE;
4322                }
4323                if(object == shadow_object)
4324                        vm_object_paging_end(shadow_object);
4325        }
4326        vm_object_unlock(shadow_object);
4327        if (object != shadow_object)
4328                vm_object_unlock(object);
4329
4330        upl_unlock(upl);
4331
4332        return KERN_SUCCESS;
4333}
4334
4335kern_return_t
4336upl_abort(
4337        upl_t   upl,
4338        int     error)
4339{
4340        vm_object_t             object = NULL;
4341        vm_object_t             shadow_object = NULL;
4342        vm_object_offset_t      offset;
4343        vm_object_offset_t      shadow_offset;
4344        vm_object_offset_t      target_offset;
4345        upl_size_t              i;
4346        wpl_array_t             lite_list;
4347        vm_page_t               t,m;
4348        int                     occupied;
4349        boolean_t               shadow_internal;
4350
4351        if (upl == UPL_NULL)
4352                return KERN_INVALID_ARGUMENT;
4353
4354        if (upl->flags & UPL_IO_WIRE) {
4355                boolean_t       empty;
4356                return upl_commit_range(upl, 
4357                        0, upl->size, 0, 
4358                        NULL, 0, &empty);
4359        }
4360
4361        upl_lock(upl);
4362        if(upl->flags & UPL_DEVICE_MEMORY) {
4363                upl_unlock(upl);
4364                return KERN_SUCCESS;
4365        }
4366
4367        object = upl->map_object;
4368
4369        if (object == NULL) {
4370                panic("upl_abort: upl object is not backed by an object");
4371                upl_unlock(upl);
4372                return KERN_INVALID_ARGUMENT;
4373        }
4374
4375        if(object->pageout) {
4376                shadow_object = object->shadow;
4377                shadow_offset = object->shadow_offset;
4378        } else {
4379                shadow_object = object;
4380                shadow_offset = upl->offset - object->paging_offset;
4381        }
4382
4383        if(upl->flags & UPL_INTERNAL) {
4384                lite_list = (wpl_array_t)
4385                        ((((uintptr_t)upl) + sizeof(struct upl))
4386                        + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4387        } else {
4388                lite_list = (wpl_array_t)
4389                        (((uintptr_t)upl) + sizeof(struct upl));
4390        }
4391        offset = 0;
4392
4393        if (object != shadow_object)
4394                vm_object_lock(object);
4395        vm_object_lock(shadow_object);
4396
4397        shadow_internal = shadow_object->internal;
4398
4399        for(i = 0; i<(upl->size); i+=PAGE_SIZE, offset += PAGE_SIZE_64) {
4400                m = VM_PAGE_NULL;
4401                target_offset = offset + shadow_offset;
4402                if(upl->flags & UPL_LITE) {
4403                        int     pg_num;
4404                        pg_num = offset/PAGE_SIZE;
4405                        if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4406                                lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4407                                m = vm_page_lookup(
4408                                        shadow_object, target_offset);
4409                        }
4410                }
4411                if(object->pageout) {
4412                        if ((t = vm_page_lookup(object, offset)) != NULL) {
4413                                t->pageout = FALSE;
4414                                VM_PAGE_FREE(t);
4415                                if(m == NULL) {
4416                                        m = vm_page_lookup(
4417                                            shadow_object, target_offset);
4418                                }
4419                                if(m != VM_PAGE_NULL)
4420                                        vm_object_paging_end(m->object);
4421                        }
4422                }
4423                if(m != VM_PAGE_NULL) {
4424                        vm_page_lock_queues();
4425                        if(m->absent) {
4426                                boolean_t must_free = TRUE;
4427
4428                                /* COPYOUT = FALSE case */
4429                                /* check for error conditions which must */
4430                                /* be passed back to the pages customer  */
4431                                if(error & UPL_ABORT_RESTART) {
4432                                        m->restart = TRUE;
4433                                        m->absent = FALSE;
4434                                        vm_object_absent_release(m->object);
4435                                        m->page_error = KERN_MEMORY_ERROR;
4436                                        m->error = TRUE;
4437                                        must_free = FALSE;
4438                                } else if(error & UPL_ABORT_UNAVAILABLE) {
4439                                        m->restart = FALSE;
4440                                        m->unusual = TRUE;
4441                                        must_free = FALSE;
4442                                } else if(error & UPL_ABORT_ERROR) {
4443                                        m->restart = FALSE;
4444                                        m->absent = FALSE;
4445                                        vm_object_absent_release(m->object);
4446                                        m->page_error = KERN_MEMORY_ERROR;
4447                                        m->error = TRUE;
4448                                        must_free = FALSE;
4449                                }
4450
4451                                /*
4452                                 * ENCRYPTED SWAP:
4453                                 * If the page was already encrypted,
4454                                 * we don't really need to decrypt it
4455                                 * now.  It will get decrypted later,
4456                                 * on demand, as soon as someone needs
4457                                 * to access its contents.
4458                                 */
4459
4460                                m->cleaning = FALSE;
4461                                m->overwriting = FALSE;
4462                                PAGE_WAKEUP_DONE(m);
4463
4464                                if (must_free == TRUE) {
4465                                        vm_page_free(m);
4466                                } else {
4467                                        vm_page_activate(m);
4468                                }
4469                                vm_page_unlock_queues();
4470                                continue;
4471                        }
4472                        /*                          
4473                         * Handle the trusted pager throttle.
4474                         */                     
4475                        if (m->laundry) { 
4476                                vm_pageout_throttle_up(m);
4477                        }         
4478                        if(m->pageout) {
4479                                assert(m->busy);
4480                                assert(m->wire_count == 1);
4481                                m->pageout = FALSE;
4482                                vm_page_unwire(m);
4483                        }
4484                        m->dump_cleaning = FALSE;
4485                        m->cleaning = FALSE;
4486                        m->overwriting = FALSE;
4487#if     MACH_PAGEMAP
4488                        vm_external_state_clr(
4489                                m->object->existence_map, m->offset);
4490#endif  /* MACH_PAGEMAP */
4491                        if(error & UPL_ABORT_DUMP_PAGES) {
4492                                vm_page_free(m);
4493                                pmap_disconnect(m->phys_page);
4494                        } else {
4495                                PAGE_WAKEUP_DONE(m);
4496                        }
4497                        vm_page_unlock_queues();
4498                }
4499        }
4500        occupied = 1;
4501        if (upl->flags & UPL_DEVICE_MEMORY)  {
4502                occupied = 0;
4503        } else if (upl->flags & UPL_LITE) {
4504                int     pg_num;
4505                int     j;
4506                pg_num = upl->size/PAGE_SIZE;
4507                pg_num = (pg_num + 31) >> 5;
4508                occupied = 0;
4509                for(j= 0; j<pg_num; j++) {
4510                        if(lite_list[j] != 0) {
4511                                occupied = 1;
4512                                break;
4513                        }
4514                }
4515        } else {
4516                if(queue_empty(&upl->map_object->memq)) {
4517                        occupied = 0;
4518                }
4519        }
4520
4521        if(occupied == 0) {
4522                if(object == shadow_object)
4523                        vm_object_paging_end(shadow_object);
4524        }
4525        vm_object_unlock(shadow_object);
4526        if (object != shadow_object)
4527                vm_object_unlock(object);
4528
4529        upl_unlock(upl);
4530        return KERN_SUCCESS;
4531}
4532
4533/* an option on commit should be wire */
4534kern_return_t
4535upl_commit(
4536        upl_t                   upl,
4537        upl_page_info_t         *page_list,
4538        mach_msg_type_number_t  count)
4539{
4540        if (upl == UPL_NULL)
4541                return KERN_INVALID_ARGUMENT;
4542
4543        if(upl->flags & (UPL_LITE | UPL_IO_WIRE)) {
4544                boolean_t       empty;
4545                return upl_commit_range(upl, 0, upl->size, 0, 
4546                                        page_list, count, &empty);
4547        }
4548
4549        if (count == 0)
4550                page_list = NULL;
4551
4552        upl_lock(upl);
4553        if (upl->flags & UPL_DEVICE_MEMORY)
4554                page_list = NULL;
4555
4556        if (upl->flags & UPL_ENCRYPTED) {
4557                /*
4558                 * ENCRYPTED SWAP:
4559                 * This UPL was encrypted, but we don't need
4560                 * to decrypt here.  We'll decrypt each page
4561                 * later, on demand, as soon as someone needs
4562                 * to access the page's contents.
4563                 */
4564        }
4565
4566        if ((upl->flags & UPL_CLEAR_DIRTY) ||
4567                (upl->flags & UPL_PAGE_SYNC_DONE) || page_list) {
4568                vm_object_t     shadow_object = upl->map_object->shadow;
4569                vm_object_t     object = upl->map_object;
4570                vm_object_offset_t target_offset;
4571                upl_size_t      xfer_end;
4572                int             entry;
4573
4574                vm_page_t       t, m;
4575                upl_page_info_t *p;
4576
4577                if (object != shadow_object)
4578                        vm_object_lock(object);
4579                vm_object_lock(shadow_object);
4580
4581                entry = 0;
4582                target_offset = object->shadow_offset;
4583                xfer_end = upl->size + object->shadow_offset;
4584
4585                while(target_offset < xfer_end) {
4586
4587                        if ((t = vm_page_lookup(object, 
4588                                target_offset - object->shadow_offset))
4589                                == NULL) {
4590                                target_offset += PAGE_SIZE_64;
4591                                entry++;
4592                                continue;
4593                        }
4594
4595                        m = vm_page_lookup(shadow_object, target_offset);
4596                        if(m != VM_PAGE_NULL) {
4597                            /*
4598                             * ENCRYPTED SWAP:
4599                             * If this page was encrypted, we
4600                             * don't need to decrypt it here.
4601                             * We'll decrypt it later, on demand,
4602                             * as soon as someone needs to access
4603                             * its contents.
4604                             */
4605
4606                            if (upl->flags & UPL_CLEAR_DIRTY) {
4607                                pmap_clear_modify(m->phys_page);
4608                                m->dirty = FALSE;
4609                            }
4610                            /* It is a part of the semantic of */
4611                            /* COPYOUT_FROM UPLs that a commit */
4612                            /* implies cache sync between the  */
4613                            /* vm page and the backing store   */
4614                            /* this can be used to strip the   */
4615                            /* precious bit as well as clean   */
4616                            if (upl->flags & UPL_PAGE_SYNC_DONE)
4617                                m->precious = FALSE;
4618
4619                           if(page_list) {
4620                                p = &(page_list[entry]);
4621                                if(page_list[entry].phys_addr &&
4622                                                p->pageout && !m->pageout) {
4623                                        vm_page_lock_queues();
4624                                        m->busy = TRUE;
4625                                        m->pageout = TRUE;
4626                                        vm_page_wire(m);
4627                                        vm_page_unlock_queues();
4628                                } else if (page_list[entry].phys_addr &&
4629                                                !p->pageout && m->pageout &&
4630                                                !m->dump_cleaning) {
4631                                        vm_page_lock_queues();
4632                                        m->pageout = FALSE;
4633                                        m->absent = FALSE;
4634                                        m->overwriting = FALSE;
4635                                        vm_page_unwire(m);
4636                                        PAGE_WAKEUP_DONE(m);
4637                                        vm_page_unlock_queues();
4638                                }
4639                                page_list[entry].phys_addr = 0;
4640                           }
4641                        }
4642                        target_offset += PAGE_SIZE_64;
4643                        entry++;
4644                }
4645                vm_object_unlock(shadow_object);
4646                if (object != shadow_object)
4647                        vm_object_unlock(object);
4648
4649        }
4650        if (upl->flags & UPL_DEVICE_MEMORY)  {
4651                vm_object_lock(upl->map_object->shadow);
4652                if(upl->map_object == upl->map_object->shadow)
4653                        vm_object_paging_end(upl->map_object->shadow);
4654                vm_object_unlock(upl->map_object->shadow);
4655        }
4656        upl_unlock(upl);
4657        return KERN_SUCCESS;
4658}
4659
4660
4661
4662kern_return_t
4663vm_object_iopl_request(
4664        vm_object_t             object,
4665        vm_object_offset_t      offset,
4666        upl_size_t              size,
4667        upl_t                   *upl_ptr,
4668        upl_page_info_array_t   user_page_list,
4669        unsigned int            *page_list_count,
4670        int                     cntrl_flags)
4671{
4672        vm_page_t               dst_page;
4673        vm_object_offset_t      dst_offset = offset;
4674        upl_size_t              xfer_size = size;
4675        upl_t                   upl = NULL;
4676        unsigned int            entry;
4677        wpl_array_t             lite_list = NULL;
4678        int                     page_field_size;
4679        int                     delayed_unlock = 0;
4680        int                     no_zero_fill = FALSE;
4681        vm_page_t               alias_page = NULL;
4682        kern_return_t           ret;
4683        vm_prot_t               prot;
4684
4685
4686        if (cntrl_flags & ~UPL_VALID_FLAGS) {
4687                /*
4688                 * For forward compatibility's sake,
4689                 * reject any unknown flag.
4690                 */
4691                return KERN_INVALID_VALUE;
4692        }
4693
4694        if (cntrl_flags & UPL_ENCRYPT) {
4695                /*
4696                 * ENCRYPTED SWAP:
4697                 * The paging path doesn't use this interface,
4698                 * so we don't support the UPL_ENCRYPT flag
4699                 * here.  We won't encrypt the pages.
4700                 */
4701                assert(! (cntrl_flags & UPL_ENCRYPT));
4702        }
4703
4704        if (cntrl_flags & UPL_NOZEROFILL)
4705                no_zero_fill = TRUE;
4706
4707        if (cntrl_flags & UPL_COPYOUT_FROM)
4708                prot = VM_PROT_READ;
4709        else
4710                prot = VM_PROT_READ | VM_PROT_WRITE;
4711
4712        if(((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
4713                size = MAX_UPL_TRANSFER * page_size;
4714        }
4715
4716        if(cntrl_flags & UPL_SET_INTERNAL)
4717                if(page_list_count != NULL)
4718                        *page_list_count = MAX_UPL_TRANSFER;
4719        if(((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4720           ((page_list_count != NULL) && (*page_list_count != 0)
4721                                && *page_list_count < (size/page_size)))
4722                return KERN_INVALID_ARGUMENT;
4723
4724        if((!object->internal) && (object->paging_offset != 0))
4725                panic("vm_object_upl_request: vnode object with non-zero paging offset\n");
4726
4727        if(object->phys_contiguous) {
4728                /* No paging operations are possible against this memory */
4729                /* and so no need for map object, ever */
4730                cntrl_flags |= UPL_SET_LITE;
4731        }
4732
4733        if(upl_ptr) {
4734                if(cntrl_flags & UPL_SET_INTERNAL) {
4735                        if(cntrl_flags & UPL_SET_LITE) {
4736                                upl = upl_create(
4737                                        UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
4738                                        size);
4739                                user_page_list = (upl_page_info_t *)
4740                                   (((uintptr_t)upl) + sizeof(struct upl));
4741                                lite_list = (wpl_array_t)
4742                                        (((uintptr_t)user_page_list) + 
4743                                        ((size/PAGE_SIZE) * 
4744                                                sizeof(upl_page_info_t)));
4745                                page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4746                                page_field_size = 
4747                                        (page_field_size + 3) & 0xFFFFFFFC;
4748                                bzero((char *)lite_list, page_field_size);
4749                                upl->flags = 
4750                                        UPL_LITE | UPL_INTERNAL | UPL_IO_WIRE;
4751                        } else {
4752                                upl = upl_create(UPL_CREATE_INTERNAL, size);
4753                                user_page_list = (upl_page_info_t *)
4754                                        (((uintptr_t)upl) 
4755                                                + sizeof(struct upl));
4756                                upl->flags = UPL_INTERNAL | UPL_IO_WIRE;
4757                        }
4758                } else {
4759                        if(cntrl_flags & UPL_SET_LITE) {
4760                                upl = upl_create(UPL_CREATE_LITE, size);
4761                                lite_list = (wpl_array_t)
4762                                   (((uintptr_t)upl) + sizeof(struct upl));
4763                                page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4764                                page_field_size = 
4765                                        (page_field_size + 3) & 0xFFFFFFFC;
4766                                bzero((char *)lite_list, page_field_size);
4767                                upl->flags = UPL_LITE | UPL_IO_WIRE;
4768                        } else {
4769                                upl = upl_create(UPL_CREATE_EXTERNAL, size);
4770                                upl->flags = UPL_IO_WIRE;
4771                        }
4772                }
4773
4774                if(object->phys_contiguous) {
4775                        upl->map_object = object;
4776                        /* don't need any shadow mappings for this one */
4777                        /* since it is already I/O memory */
4778                        upl->flags |= UPL_DEVICE_MEMORY;
4779
4780                        vm_object_lock(object);
4781                        vm_object_paging_begin(object);
4782                        vm_object_unlock(object);
4783
4784                        /* paging in progress also protects the paging_offset */
4785                        upl->offset = offset + object->paging_offset;
4786                        upl->size = size;
4787                        *upl_ptr = upl;
4788                        if(user_page_list) {
4789                                user_page_list[0].phys_addr = 
4790                                  (offset + object->shadow_offset)>>PAGE_SHIFT;
4791                                user_page_list[0].device = TRUE;
4792                        }
4793
4794                        if(page_list_count != NULL) {
4795                                if (upl->flags & UPL_INTERNAL) {
4796                                        *page_list_count = 0;
4797                                } else {
4798                                        *page_list_count = 1;
4799                                }
4800                        }
4801                        return KERN_SUCCESS;
4802                }
4803                if(user_page_list)
4804                        user_page_list[0].device = FALSE;
4805                        
4806                if(cntrl_flags & UPL_SET_LITE) {
4807                        upl->map_object = object;
4808                } else {
4809                        upl->map_object = vm_object_allocate(size);
4810                        vm_object_lock(upl->map_object);
4811                        upl->map_object->shadow = object;
4812                        upl->map_object->pageout = TRUE;
4813                        upl->map_object->can_persist = FALSE;
4814                        upl->map_object->copy_strategy = 
4815                                        MEMORY_OBJECT_COPY_NONE;
4816                        upl->map_object->shadow_offset = offset;
4817                        upl->map_object->wimg_bits = object->wimg_bits;
4818                        vm_object_unlock(upl->map_object);
4819                }
4820        }
4821        vm_object_lock(object);
4822        vm_object_paging_begin(object);
4823
4824        if (!object->phys_contiguous) {
4825                /* Protect user space from future COW operations */
4826                object->true_share = TRUE;
4827                if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4828                        object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4829        }
4830
4831        /* we can lock the upl offset now that paging_in_progress is set */
4832        if(upl_ptr) {
4833                upl->size = size;
4834                upl->offset = offset + object->paging_offset;
4835                *upl_ptr = upl;
4836#ifdef UPL_DEBUG
4837                queue_enter(&object->uplq, upl, upl_t, uplq);
4838#endif /* UPL_DEBUG */
4839        }
4840
4841        if (cntrl_flags & UPL_BLOCK_ACCESS) {
4842                /*
4843                 * The user requested that access to the pages in this URL
4844                 * be blocked until the UPL is commited or aborted.
4845                 */
4846                upl->flags |= UPL_ACCESS_BLOCKED;
4847        }
4848
4849        entry = 0;
4850        while (xfer_size) {
4851                if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
4852                        if (delayed_unlock) {
4853                                delayed_unlock = 0;
4854                                vm_page_unlock_queues();
4855                        }
4856                        vm_object_unlock(object);
4857                        VM_PAGE_GRAB_FICTITIOUS(alias_page);
4858                        vm_object_lock(object);
4859                }
4860                dst_page = vm_page_lookup(object, dst_offset);
4861
4862                /*
4863                 * ENCRYPTED SWAP:
4864                 * If the page is encrypted, we need to decrypt it,
4865                 * so force a soft page fault.
4866                 */
4867                if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4868                    (dst_page->encrypted) ||
4869                    (dst_page->unusual && (dst_page->error || 
4870                                           dst_page->restart ||
4871                                           dst_page->absent ||
4872                                           dst_page->fictitious ||
4873                                           (prot & dst_page->page_lock)))) {
4874                        vm_fault_return_t       result;
4875                   do {
4876                        vm_page_t       top_page;
4877                        kern_return_t   error_code;
4878                        int             interruptible;
4879
4880                        vm_object_offset_t      lo_offset = offset;
4881                        vm_object_offset_t      hi_offset = offset + size;
4882
4883
4884                        if (delayed_unlock) {
4885                                delayed_unlock = 0;
4886                                vm_page_unlock_queues();
4887                        }
4888
4889                        if(cntrl_flags & UPL_SET_INTERRUPTIBLE) {
4890                                interruptible = THREAD_ABORTSAFE;
4891                        } else {
4892                                interruptible = THREAD_UNINT;
4893                        }
4894
4895                        result = vm_fault_page(object, dst_offset,
4896                                prot | VM_PROT_WRITE, FALSE, 
4897                                interruptible,
4898                                lo_offset, hi_offset,
4899                                VM_BEHAVIOR_SEQUENTIAL,
4900                                &prot, &dst_page, &top_page,
4901                                (int *)0,
4902                                &error_code, no_zero_fill, FALSE, NULL, 0);
4903
4904                        switch(result) {
4905                        case VM_FAULT_SUCCESS:
4906
4907                                PAGE_WAKEUP_DONE(dst_page);
4908
4909                                /*
4910                                 *      Release paging references and
4911                                 *      top-level placeholder page, if any.
4912                                 */
4913
4914                                if(top_page != VM_PAGE_NULL) {
4915                                        vm_object_t local_object;
4916                                        local_object = 
4917                                                top_page->object;
4918                                        if(top_page->object 
4919                                                != dst_page->object) {
4920                                                vm_object_lock(
4921                                                        local_object);
4922                                                VM_PAGE_FREE(top_page);
4923                                                vm_object_paging_end(
4924                                                        local_object);
4925                                                vm_object_unlock(
4926                                                        local_object);
4927                                        } else {
4928                                                VM_PAGE_FREE(top_page);
4929                                                vm_object_paging_end(
4930                                                        local_object);
4931                                        }
4932                                }
4933
4934                                break;
4935                        
4936                                
4937                        case VM_FAULT_RETRY:
4938                                vm_object_lock(object);
4939                                vm_object_paging_begin(object);
4940                                break;
4941
4942                        case VM_FAULT_FICTITIOUS_SHORTAGE:
4943                                vm_page_more_fictitious();
4944                                vm_object_lock(object);
4945                                vm_object_paging_begin(object);
4946                                break;
4947
4948                        case VM_FAULT_MEMORY_SHORTAGE:
4949                                if (vm_page_wait(interruptible)) {
4950                                        vm_object_lock(object);
4951                                        vm_object_paging_begin(object);
4952                                        break;
4953                                }
4954                                /* fall thru */
4955
4956                        case VM_FAULT_INTERRUPTED:
4957                                error_code = MACH_SEND_INTERRUPTED;
4958                        case VM_FAULT_MEMORY_ERROR:
4959                                ret = (error_code ? error_code:
4960                                        KERN_MEMORY_ERROR);
4961                                vm_object_lock(object);
4962                                for(; offset < dst_offset;
4963                                                offset += PAGE_SIZE) {
4964                                   dst_page = vm_page_lookup(
4965                                                object, offset);
4966                                   if(dst_page == VM_PAGE_NULL)
4967                                        panic("vm_object_iopl_request: Wired pages missing. \n");
4968                                   vm_page_lock_queues();
4969                                   vm_page_unwire(dst_page);
4970                                   vm_page_unlock_queues();
4971                                   VM_STAT(reactivations++);
4972                                }
4973                                vm_object_unlock(object);
4974                                upl_destroy(upl);
4975                                return ret;
4976                        }
4977                   } while ((result != VM_FAULT_SUCCESS) 
4978                                || (result == VM_FAULT_INTERRUPTED));
4979                }
4980                if (delayed_unlock == 0)
4981                        vm_page_lock_queues();
4982                vm_page_wire(dst_page);
4983
4984                if (cntrl_flags & UPL_BLOCK_ACCESS) {
4985                        /*
4986                         * Mark the page "busy" to block any future page fault
4987                         * on this page.  We'll also remove the mapping
4988                         * of all these pages before leaving this routine.
4989                         */
4990                        assert(!dst_page->fictitious);
4991                        dst_page->busy = TRUE;
4992                }
4993
4994                if (upl_ptr) {
4995                        if (cntrl_flags & UPL_SET_LITE) {
4996                                int     pg_num;
4997                                pg_num = (dst_offset-offset)/PAGE_SIZE;
4998                                lite_list[pg_num>>5] |= 1 << (pg_num & 31);
4999                        } else {
5000                                /*
5001                                 * Convert the fictitious page to a 
5002                                 * private shadow of the real page.
5003                                 */
5004                                assert(alias_page->fictitious);
5005                                alias_page->fictitious = FALSE;
5006                                alias_page->private = TRUE;
5007                                alias_page->pageout = TRUE;
5008                                alias_page->phys_page = dst_page->phys_page;
5009                                vm_page_wire(alias_page);
5010
5011                                vm_page_insert(alias_page, 
5012                                        upl->map_object, size - xfer_size);
5013                                assert(!alias_page->wanted);
5014                                alias_page->busy = FALSE;
5015                                alias_page->absent = FALSE;
5016                        }
5017
5018                        /* expect the page to be used */
5019                        dst_page->reference = TRUE;
5020
5021                        if (!(cntrl_flags & UPL_COPYOUT_FROM))
5022                                dst_page->dirty = TRUE;
5023                        alias_page = NULL;
5024
5025                        if (user_page_list) {
5026                                user_page_list[entry].phys_addr
5027                                        = dst_page->phys_page;
5028                                user_page_list[entry].dirty =
5029                                                dst_page->dirty;
5030                                user_page_list[entry].pageout =
5031                                                dst_page->pageout;
5032                                user_page_list[entry].absent =
5033                                                dst_page->absent;
5034                                user_page_list[entry].precious =
5035                                                dst_page->precious;
5036                        }
5037                }
5038                if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
5039                        delayed_unlock = 0;
5040                        vm_page_unlock_queues();
5041                }
5042                entry++;
5043                dst_offset += PAGE_SIZE_64;
5044                xfer_size -= PAGE_SIZE;
5045        }
5046        if (delayed_unlock)
5047                vm_page_unlock_queues();
5048
5049        if (upl->flags & UPL_INTERNAL) {
5050                if(page_list_count != NULL)
5051                        *page_list_count = 0;
5052        } else if (*page_list_count > entry) {
5053                if(page_list_count != NULL)
5054                        *page_list_count = entry;
5055        }
5056
5057        if (alias_page != NULL) {
5058                vm_page_lock_queues();
5059                vm_page_free(alias_page);
5060                vm_page_unlock_queues();
5061        }
5062
5063        vm_object_unlock(object);
5064
5065        if (cntrl_flags & UPL_BLOCK_ACCESS) {
5066                /*
5067                 * We've marked all the pages "busy" so that future
5068                 * page faults will block.
5069                 * Now remove the mapping for these pages, so that they
5070                 * can't be accessed without causing a page fault.
5071                 */
5072                vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5073                                       PMAP_NULL, 0, VM_PROT_NONE);
5074        }
5075
5076        return KERN_SUCCESS;
5077}
5078
5079kern_return_t
5080upl_transpose(
5081        upl_t           upl1,
5082        upl_t           upl2)
5083{
5084        kern_return_t           retval;
5085        boolean_t               upls_locked;
5086        vm_object_t             object1, object2;
5087
5088        if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5089                return KERN_INVALID_ARGUMENT;
5090        }
5091        
5092        upls_locked = FALSE;
5093
5094        /*
5095         * Since we need to lock both UPLs at the same time,
5096         * avoid deadlocks by always taking locks in the same order.
5097         */
5098        if (upl1 < upl2) {
5099                upl_lock(upl1);
5100                upl_lock(upl2);
5101        } else {
5102                upl_lock(upl2);
5103                upl_lock(upl1);
5104        }
5105        upls_locked = TRUE;     /* the UPLs will need to be unlocked */
5106
5107        object1 = upl1->map_object;
5108        object2 = upl2->map_object;
5109
5110        if (upl1->offset != 0 || upl2->offset != 0 ||
5111            upl1->size != upl2->size) {
5112                /*
5113                 * We deal only with full objects, not subsets.
5114                 * That's because we exchange the entire backing store info
5115                 * for the objects: pager, resident pages, etc...  We can't do
5116                 * only part of it.
5117                 */
5118                retval = KERN_INVALID_VALUE;
5119                goto done;
5120        }
5121
5122        /*
5123         * Tranpose the VM objects' backing store.
5124         */
5125        retval = vm_object_transpose(object1, object2,
5126                                     (vm_object_size_t) upl1->size);
5127
5128        if (retval == KERN_SUCCESS) {
5129                /*
5130                 * Make each UPL point to the correct VM object, i.e. the
5131                 * object holding the pages that the UPL refers to...
5132                 */
5133                upl1->map_object = object2;
5134                upl2->map_object = object1;
5135        }
5136
5137done:
5138        /*
5139         * Cleanup.
5140         */
5141        if (upls_locked) {
5142                upl_unlock(upl1);
5143                upl_unlock(upl2);
5144                upls_locked = FALSE;
5145        }
5146
5147        return retval;
5148}
5149
5150/*
5151 * ENCRYPTED SWAP:
5152 *
5153 * Rationale:  the user might have some encrypted data on disk (via
5154 * FileVault or any other mechanism).  That data is then decrypted in
5155 * memory, which is safe as long as the machine is secure.  But that
5156 * decrypted data in memory could be paged out to disk by the default
5157 * pager.  The data would then be stored on disk in clear (not encrypted)
5158 * and it could be accessed by anyone who gets physical access to the
5159 * disk (if the laptop or the disk gets stolen for example).  This weakens
5160 * the security offered by FileVault.
5161 *
5162 * Solution:  the default pager will optionally request that all the
5163 * pages it gathers for pageout be encrypted, via the UPL interfaces,
5164 * before it sends this UPL to disk via the vnode_pageout() path.
5165 * 
5166 * Notes:
5167 * 
5168 * To avoid disrupting the VM LRU algorithms, we want to keep the
5169 * clean-in-place mechanisms, which allow us to send some extra pages to 
5170 * swap (clustering) without actually removing them from the user's
5171 * address space.  We don't want the user to unknowingly access encrypted
5172 * data, so we have to actually remove the encrypted pages from the page
5173 * table.  When the user accesses the data, the hardware will fail to
5174 * locate the virtual page in its page table and will trigger a page
5175 * fault.  We can then decrypt the page and enter it in the page table
5176 * again.  Whenever we allow the user to access the contents of a page,
5177 * we have to make sure it's not encrypted.
5178 *
5179 * 
5180 */
5181/*
5182 * ENCRYPTED SWAP:
5183 * Reserve of virtual addresses in the kernel address space.
5184 * We need to map the physical pages in the kernel, so that we
5185 * can call the encryption/decryption routines with a kernel
5186 * virtual address.  We keep this pool of pre-allocated kernel
5187 * virtual addresses so that we don't have to scan the kernel's
5188 * virtual address space each time we need to encrypt or decrypt
5189 * a physical page.
5190 * It would be nice to be able to encrypt and decrypt in physical
5191 * mode but that might not always be more efficient...
5192 */
5193decl_simple_lock_data(,vm_paging_lock)
5194#define VM_PAGING_NUM_PAGES     64
5195vm_map_offset_t vm_paging_base_address = 0;
5196boolean_t       vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5197int             vm_paging_max_index = 0;
5198unsigned long   vm_paging_no_kernel_page = 0;
5199unsigned long   vm_paging_objects_mapped = 0;
5200unsigned long   vm_paging_pages_mapped = 0;
5201unsigned long   vm_paging_objects_mapped_slow = 0;
5202unsigned long   vm_paging_pages_mapped_slow = 0;
5203
5204/*
5205 * ENCRYPTED SWAP:
5206 * vm_paging_map_object:
5207 *      Maps part of a VM object's pages in the kernel
5208 *      virtual address space, using the pre-allocated
5209 *      kernel virtual addresses, if possible.
5210 * Context:
5211 *      The VM object is locked.  This lock will get
5212 *      dropped and re-acquired though.
5213 */
5214kern_return_t
5215vm_paging_map_object(
5216        vm_map_offset_t         *address,
5217        vm_page_t               page,
5218        vm_object_t             object,
5219        vm_object_offset_t      offset,
5220        vm_map_size_t           *size)
5221{
5222        kern_return_t           kr;
5223        vm_map_offset_t         page_map_offset;
5224        vm_map_size_t           map_size;
5225        vm_object_offset_t      object_offset;
5226#ifdef __ppc__
5227        int                     i;
5228        vm_map_entry_t          map_entry;
5229#endif /* __ppc__ */
5230
5231
5232#ifdef __ppc__
5233        if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5234                /*
5235                 * Optimization for the PowerPC.
5236                 * Use one of the pre-allocated kernel virtual addresses
5237                 * and just enter the VM page in the kernel address space
5238                 * at that virtual address.
5239                 */
5240                vm_object_unlock(object);
5241                simple_lock(&vm_paging_lock);
5242
5243                if (vm_paging_base_address == 0) {
5244                        /*
5245                         * Initialize our pool of pre-allocated kernel
5246                         * virtual addresses.
5247                         */
5248                        simple_unlock(&vm_paging_lock);
5249                        page_map_offset = 0;
5250                        kr = vm_map_find_space(kernel_map,
5251                                               &page_map_offset,
5252                                               VM_PAGING_NUM_PAGES * PAGE_SIZE,
5253                                               0,
5254                                               &map_entry);
5255                        if (kr != KERN_SUCCESS) {
5256                                panic("vm_paging_map_object: "
5257                                      "kernel_map full\n");
5258                        }
5259                        map_entry->object.vm_object = kernel_object;
5260                        map_entry->offset =
5261                                page_map_offset - VM_MIN_KERNEL_ADDRESS;
5262                        vm_object_reference(kernel_object);
5263                        vm_map_unlock(kernel_map);
5264
5265                        simple_lock(&vm_paging_lock);
5266                        if (vm_paging_base_address != 0) {
5267                                /* someone raced us and won: undo */
5268                                simple_unlock(&vm_paging_lock);
5269                                kr = vm_map_remove(kernel_map,
5270                                                   page_map_offset,
5271                                                   page_map_offset + 
5272                                                   (VM_PAGING_NUM_PAGES
5273                                                    * PAGE_SIZE),
5274                                                   VM_MAP_NO_FLAGS);
5275                                assert(kr == KERN_SUCCESS);
5276                                simple_lock(&vm_paging_lock);
5277                        } else {
5278                                vm_paging_base_address = page_map_offset;
5279                        }
5280                }
5281
5282                /*
5283                 * Try and find an available kernel virtual address
5284                 * from our pre-allocated pool.
5285                 */
5286                page_map_offset = 0;
5287                for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5288                        if (vm_paging_page_inuse[i] == FALSE) {
5289                                page_map_offset = vm_paging_base_address +
5290                                        (i * PAGE_SIZE);
5291                                break;
5292                        }
5293                }
5294
5295                if (page_map_offset != 0) {
5296                        /*
5297                         * We found a kernel virtual address;
5298                         * map the physical page to that virtual address.
5299                         */
5300                        if (i > vm_paging_max_index) {
5301                                vm_paging_max_index = i;
5302                        }
5303                        vm_paging_page_inuse[i] = TRUE;
5304                        simple_unlock(&vm_paging_lock);
5305                        pmap_map_block(kernel_pmap,
5306                                       page_map_offset,
5307                                       page->phys_page,
5308                                       1,                                               /* Size is number of 4k pages */
5309                                       VM_PROT_DEFAULT,
5310                                       ((int) page->object->wimg_bits &
5311                                        VM_WIMG_MASK),
5312                                       0);
5313                        vm_paging_objects_mapped++;
5314                        vm_paging_pages_mapped++; 
5315                        *address = page_map_offset;
5316                        vm_object_lock(object);
5317
5318                        /* all done and mapped, ready to use ! */
5319                        return KERN_SUCCESS;
5320                }
5321
5322                /*
5323                 * We ran out of pre-allocated kernel virtual
5324                 * addresses.  Just map the page in the kernel
5325                 * the slow and regular way.
5326                 */
5327                vm_paging_no_kernel_page++;
5328                simple_unlock(&vm_paging_lock);
5329                vm_object_lock(object);
5330        }
5331#endif /* __ppc__ */
5332
5333        object_offset = vm_object_trunc_page(offset);
5334        map_size = vm_map_round_page(*size);
5335
5336        /*
5337         * Try and map the required range of the object
5338         * in the kernel_map
5339         */
5340
5341        /* don't go beyond the object's end... */
5342        if (object_offset >= object->size) {
5343                map_size = 0;
5344        } else if (map_size > object->size - offset) {
5345                map_size = object->size - offset;
5346        }
5347
5348        vm_object_reference_locked(object);     /* for the map entry */
5349        vm_object_unlock(object);
5350
5351        kr = vm_map_enter(kernel_map,
5352                          address,
5353                          map_size,
5354                          0,
5355                          VM_FLAGS_ANYWHERE,
5356                          object,
5357                          object_offset,
5358                          FALSE,
5359                          VM_PROT_DEFAULT,
5360                          VM_PROT_ALL,
5361                          VM_INHERIT_NONE);
5362        if (kr != KERN_SUCCESS) {
5363                *address = 0;
5364                *size = 0;
5365                vm_object_deallocate(object);   /* for the map entry */
5366                return kr;
5367        }
5368
5369        *size = map_size;
5370
5371        /*
5372         * Enter the mapped pages in the page table now.
5373         */
5374        vm_object_lock(object);
5375        for (page_map_offset = 0;
5376             map_size != 0;
5377             map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5378                unsigned int    cache_attr;
5379
5380                page = vm_page_lookup(object, offset + page_map_offset);
5381                if (page == VM_PAGE_NULL) {
5382                        panic("vm_paging_map_object: no page !?");
5383                }
5384                if (page->no_isync == TRUE) {
5385                        pmap_sync_page_data_phys(page->phys_page);
5386                }
5387                cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5388
5389                PMAP_ENTER(kernel_pmap,
5390                           *address + page_map_offset,
5391                           page,
5392                           VM_PROT_DEFAULT,
5393                           cache_attr,
5394                           FALSE);
5395        }
5396                           
5397        vm_paging_objects_mapped_slow++;
5398        vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5399
5400        return KERN_SUCCESS;
5401}
5402
5403/*
5404 * ENCRYPTED SWAP:
5405 * vm_paging_unmap_object:
5406 *      Unmaps part of a VM object's pages from the kernel
5407 *      virtual address space.
5408 * Context:
5409 *      The VM object is locked.  This lock will get
5410 *      dropped and re-acquired though.
5411 */
5412void
5413vm_paging_unmap_object(
5414        vm_object_t     object,
5415        vm_map_offset_t start,
5416        vm_map_offset_t end)
5417{
5418        kern_return_t   kr;
5419#ifdef __ppc__
5420        int             i;
5421#endif /* __ppc__ */
5422
5423        if ((vm_paging_base_address != 0) &&
5424            ((start < vm_paging_base_address) ||
5425             (end > (vm_paging_base_address
5426                     + (VM_PAGING_NUM_PAGES * PAGE_SIZE))))) {
5427                /*
5428                 * We didn't use our pre-allocated pool of
5429                 * kernel virtual address.  Deallocate the
5430                 * virtual memory.
5431                 */
5432                if (object != VM_OBJECT_NULL) {
5433                        vm_object_unlock(object);
5434                }
5435                kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5436                if (object != VM_OBJECT_NULL) {
5437                        vm_object_lock(object);
5438                }
5439                assert(kr == KERN_SUCCESS);
5440        } else {
5441                /*
5442                 * We used a kernel virtual address from our
5443                 * pre-allocated pool.  Put it back in the pool
5444                 * for next time.
5445                 */
5446#ifdef __ppc__
5447                assert(end - start == PAGE_SIZE);
5448                i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5449
5450                /* undo the pmap mapping */
5451                mapping_remove(kernel_pmap, start);
5452
5453                simple_lock(&vm_paging_lock);
5454                vm_paging_page_inuse[i] = FALSE;
5455                simple_unlock(&vm_paging_lock);
5456#endif /* __ppc__ */
5457        }
5458}
5459
5460/*
5461 * Encryption data.
5462 * "iv" is the "initial vector".  Ideally, we want to
5463 * have a different one for each page we encrypt, so that
5464 * crackers can't find encryption patterns too easily.
5465 */
5466#define SWAP_CRYPT_AES_KEY_SIZE 128     /* XXX 192 and 256 don't work ! */
5467boolean_t               swap_crypt_ctx_initialized = FALSE;
5468aes_32t                 swap_crypt_key[8]; /* big enough for a 256 key */
5469aes_ctx                 swap_crypt_ctx;
5470const unsigned char     swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5471
5472#if DEBUG
5473boolean_t               swap_crypt_ctx_tested = FALSE;
5474unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5475unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5476unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5477#endif /* DEBUG */
5478
5479extern u_long random(void);
5480
5481/*
5482 * Initialize the encryption context: key and key size.
5483 */
5484void swap_crypt_ctx_initialize(void); /* forward */
5485void
5486swap_crypt_ctx_initialize(void)
5487{
5488        unsigned int    i;
5489
5490        /*
5491         * No need for locking to protect swap_crypt_ctx_initialized
5492         * because the first use of encryption will come from the
5493         * pageout thread (we won't pagein before there's been a pageout)
5494         * and there's only one pageout thread.
5495         */
5496        if (swap_crypt_ctx_initialized == FALSE) {
5497                for (i = 0;
5498                     i < (sizeof (swap_crypt_key) /
5499                          sizeof (swap_crypt_key[0]));
5500                     i++) {
5501                        swap_crypt_key[i] = random();
5502                }
5503                aes_encrypt_key((const unsigned char *) swap_crypt_key,
5504                                SWAP_CRYPT_AES_KEY_SIZE,
5505                                &swap_crypt_ctx.encrypt);
5506                aes_decrypt_key((const unsigned char *) swap_crypt_key,
5507                                SWAP_CRYPT_AES_KEY_SIZE,
5508                                &swap_crypt_ctx.decrypt);
5509                swap_crypt_ctx_initialized = TRUE;
5510        }
5511
5512#if DEBUG
5513        /*
5514         * Validate the encryption algorithms.
5515         */
5516        if (swap_crypt_ctx_tested == FALSE) {
5517                /* initialize */
5518                for (i = 0; i < 4096; i++) {
5519                        swap_crypt_test_page_ref[i] = (char) i;
5520                }
5521                /* encrypt */
5522                aes_encrypt_cbc(swap_crypt_test_page_ref,
5523                                swap_crypt_null_iv,
5524                                PAGE_SIZE / AES_BLOCK_SIZE,
5525                                swap_crypt_test_page_encrypt,
5526                                &swap_crypt_ctx.encrypt);
5527                /* decrypt */
5528                aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5529                                swap_crypt_null_iv,
5530                                PAGE_SIZE / AES_BLOCK_SIZE,
5531                                swap_crypt_test_page_decrypt,
5532                                &swap_crypt_ctx.decrypt);
5533                /* compare result with original */
5534                for (i = 0; i < 4096; i ++) {
5535                        if (swap_crypt_test_page_decrypt[i] !=
5536                            swap_crypt_test_page_ref[i]) {
5537                                panic("encryption test failed");
5538                        }
5539                }
5540
5541                /* encrypt again */
5542                aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5543                                swap_crypt_null_iv,
5544                                PAGE_SIZE / AES_BLOCK_SIZE,
5545                                swap_crypt_test_page_decrypt,
5546                                &swap_crypt_ctx.encrypt);
5547                /* decrypt in place */
5548                aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5549                                swap_crypt_null_iv,
5550                                PAGE_SIZE / AES_BLOCK_SIZE,
5551                                swap_crypt_test_page_decrypt,
5552                                &swap_crypt_ctx.decrypt);
5553                for (i = 0; i < 4096; i ++) {
5554                        if (swap_crypt_test_page_decrypt[i] !=
5555                            swap_crypt_test_page_ref[i]) {
5556                                panic("in place encryption test failed");
5557                        }
5558                }
5559
5560                swap_crypt_ctx_tested = TRUE;
5561        }
5562#endif /* DEBUG */
5563}
5564
5565/*
5566 * ENCRYPTED SWAP:
5567 * vm_page_encrypt:
5568 *      Encrypt the given page, for secure paging.
5569 *      The page might already be mapped at kernel virtual
5570 *      address "kernel_mapping_offset".  Otherwise, we need
5571 *      to map it.
5572 * 
5573 * Context:
5574 *      The page's object is locked, but this lock will be released
5575 *      and re-acquired.
5576 *      The page is busy and not accessible by users (not entered in any pmap).
5577 */
5578void
5579vm_page_encrypt(
5580        vm_page_t       page,
5581        vm_map_offset_t kernel_mapping_offset)
5582{
5583        int                     clear_refmod = 0;
5584        kern_return_t           kr;
5585        boolean_t               page_was_referenced;
5586        boolean_t               page_was_modified;
5587        vm_map_size_t           kernel_mapping_size;
5588        vm_offset_t             kernel_vaddr;
5589        union {
5590                unsigned char   aes_iv[AES_BLOCK_SIZE];
5591                struct {
5592                        memory_object_t         pager_object;
5593                        vm_object_offset_t      paging_offset;
5594                } vm;
5595        } encrypt_iv;
5596
5597        if (! vm_pages_encrypted) {
5598                vm_pages_encrypted = TRUE;
5599        }
5600
5601        assert(page->busy);
5602        assert(page->dirty || page->precious);
5603        
5604        if (page->encrypted) {
5605                /*
5606                 * Already encrypted: no need to do it again.
5607                 */
5608                vm_page_encrypt_already_encrypted_counter++;
5609                return;
5610        }
5611        ASSERT_PAGE_DECRYPTED(page);
5612
5613        /*
5614         * Gather the "reference" and "modified" status of the page.
5615         * We'll restore these values after the encryption, so that
5616         * the encryption is transparent to the rest of the system
5617         * and doesn't impact the VM's LRU logic.
5618         */
5619        page_was_referenced =
5620                (page->reference || pmap_is_referenced(page->phys_page));
5621        page_was_modified = 
5622                (page->dirty || pmap_is_modified(page->phys_page));
5623
5624        if (kernel_mapping_offset == 0) {
5625                /*
5626                 * The page hasn't already been mapped in kernel space
5627                 * by the caller.  Map it now, so that we can access
5628                 * its contents and encrypt them.
5629                 */
5630                kernel_mapping_size = PAGE_SIZE;
5631                kr = vm_paging_map_object(&kernel_mapping_offset,
5632                                          page,
5633                                          page->object,
5634                                          page->offset,
5635                                          &kernel_mapping_size);
5636                if (kr != KERN_SUCCESS) {
5637                        panic("vm_page_encrypt: "
5638                              "could not map page in kernel: 0x%x\n",
5639                              kr);
5640                }
5641        } else {
5642                kernel_mapping_size = 0;
5643        }
5644        kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5645
5646        if (swap_crypt_ctx_initialized == FALSE) {
5647                swap_crypt_ctx_initialize();
5648        }
5649        assert(swap_crypt_ctx_initialized);
5650
5651        /*
5652         * Prepare an "initial vector" for the encryption.
5653         * We use the "pager" and the "paging_offset" for that
5654         * page to obfuscate the encrypted data a bit more and
5655         * prevent crackers from finding patterns that they could
5656         * use to break the key.
5657         */
5658        bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5659        encrypt_iv.vm.pager_object = page->object->pager;
5660        encrypt_iv.vm.paging_offset =
5661                page->object->paging_offset + page->offset;
5662
5663        vm_object_unlock(page->object);
5664
5665        /* encrypt the "initial vector" */
5666        aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5667                        swap_crypt_null_iv,
5668                        1,
5669                        &encrypt_iv.aes_iv[0],
5670                        &swap_crypt_ctx.encrypt);
5671                  
5672        /*
5673         * Encrypt the page.
5674         */
5675        aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5676                        &