linux/Documentation/filesystems/Locking History
<<
>>
Prefs
   1        The text below describes the locking rules for VFS-related methods.
   2It is (believed to be) up-to-date. *Please*, if you change anything in
   3prototypes or locking protocols - update this file. And update the relevant
   4instances in the tree, don't leave that to maintainers of filesystems/devices/
   5etc. At the very least, put the list of dubious cases at the end of this file.
   6Don't turn it into a log - maintainers of out-of-the-tree code are supposed to
   7be able to use diff(1).
   8        Thing currently missing here: socket operations. Alexey?
   9
  10--------------------------- dentry_operations --------------------------
  11prototypes:
  12        int (*d_revalidate)(struct dentry *, int);
  13        int (*d_hash) (struct dentry *, struct qstr *);
  14        int (*d_compare) (struct dentry *, struct qstr *, struct qstr *);
  15        int (*d_delete)(struct dentry *);
  16        void (*d_release)(struct dentry *);
  17        void (*d_iput)(struct dentry *, struct inode *);
  18        char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen);
  19
  20locking rules:
  21        none have BKL
  22                dcache_lock     rename_lock     ->d_lock        may block
  23d_revalidate:   no              no              no              yes
  24d_hash:         no              no              no              yes
  25d_compare:      no              yes             no              no 
  26d_delete:       yes             no              yes             no
  27d_release:      no              no              no              yes
  28d_iput:         no              no              no              yes
  29d_dname:        no              no              no              no
  30
  31--------------------------- inode_operations --------------------------- 
  32prototypes:
  33        int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
  34        struct dentry * (*lookup) (struct inode *,struct dentry *,
  35                        struct nameidata *);
  36        int (*link) (struct dentry *,struct inode *,struct dentry *);
  37        int (*unlink) (struct inode *,struct dentry *);
  38        int (*symlink) (struct inode *,struct dentry *,const char *);
  39        int (*mkdir) (struct inode *,struct dentry *,int);
  40        int (*rmdir) (struct inode *,struct dentry *);
  41        int (*mknod) (struct inode *,struct dentry *,int,dev_t);
  42        int (*rename) (struct inode *, struct dentry *,
  43                        struct inode *, struct dentry *);
  44        int (*readlink) (struct dentry *, char __user *,int);
  45        int (*follow_link) (struct dentry *, struct nameidata *);
  46        void (*truncate) (struct inode *);
  47        int (*permission) (struct inode *, int, struct nameidata *);
  48        int (*setattr) (struct dentry *, struct iattr *);
  49        int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
  50        int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
  51        ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
  52        ssize_t (*listxattr) (struct dentry *, char *, size_t);
  53        int (*removexattr) (struct dentry *, const char *);
  54
  55locking rules:
  56        all may block, none have BKL
  57                i_mutex(inode)
  58lookup:         yes
  59create:         yes
  60link:           yes (both)
  61mknod:          yes
  62symlink:        yes
  63mkdir:          yes
  64unlink:         yes (both)
  65rmdir:          yes (both)      (see below)
  66rename:         yes (all)       (see below)
  67readlink:       no
  68follow_link:    no
  69truncate:       yes             (see below)
  70setattr:        yes
  71permission:     no
  72getattr:        no
  73setxattr:       yes
  74getxattr:       no
  75listxattr:      no
  76removexattr:    yes
  77        Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
  78victim.
  79        cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
  80        ->truncate() is never called directly - it's a callback, not a
  81method. It's called by vmtruncate() - library function normally used by
  82->setattr(). Locking information above applies to that call (i.e. is
  83inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been
  84passed).
  85
  86See Documentation/filesystems/directory-locking for more detailed discussion
  87of the locking scheme for directory operations.
  88
  89--------------------------- super_operations ---------------------------
  90prototypes:
  91        struct inode *(*alloc_inode)(struct super_block *sb);
  92        void (*destroy_inode)(struct inode *);
  93        void (*dirty_inode) (struct inode *);
  94        int (*write_inode) (struct inode *, int);
  95        void (*drop_inode) (struct inode *);
  96        void (*delete_inode) (struct inode *);
  97        void (*put_super) (struct super_block *);
  98        void (*write_super) (struct super_block *);
  99        int (*sync_fs)(struct super_block *sb, int wait);
 100        int (*freeze_fs) (struct super_block *);
 101        int (*unfreeze_fs) (struct super_block *);
 102        int (*statfs) (struct dentry *, struct kstatfs *);
 103        int (*remount_fs) (struct super_block *, int *, char *);
 104        void (*clear_inode) (struct inode *);
 105        void (*umount_begin) (struct super_block *);
 106        int (*show_options)(struct seq_file *, struct vfsmount *);
 107        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
 108        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 109
 110locking rules:
 111        All may block.
 112        None have BKL
 113                        s_umount
 114alloc_inode:
 115destroy_inode:
 116dirty_inode:                            (must not sleep)
 117write_inode:
 118drop_inode:                             !!!inode_lock!!!
 119delete_inode:
 120put_super:              write
 121write_super:            read
 122sync_fs:                read
 123freeze_fs:              read
 124unfreeze_fs:            read
 125statfs:                 no
 126remount_fs:             maybe           (see below)
 127clear_inode:
 128umount_begin:           no
 129show_options:           no              (namespace_sem)
 130quota_read:             no              (see below)
 131quota_write:            no              (see below)
 132
 133->remount_fs() will have the s_umount exclusive lock if it's already mounted.
 134When called from get_sb_single, it does NOT have the s_umount lock.
 135->quota_read() and ->quota_write() functions are both guaranteed to
 136be the only ones operating on the quota file by the quota code (via
 137dqio_sem) (unless an admin really wants to screw up something and
 138writes to quota files with quotas on). For other details about locking
 139see also dquot_operations section.
 140
 141--------------------------- file_system_type ---------------------------
 142prototypes:
 143        int (*get_sb) (struct file_system_type *, int,
 144                       const char *, void *, struct vfsmount *);
 145        void (*kill_sb) (struct super_block *);
 146locking rules:
 147                may block       BKL
 148get_sb:         yes             no
 149kill_sb:        yes             no
 150
 151->get_sb() returns error or 0 with locked superblock attached to the vfsmount
 152(exclusive on ->s_umount).
 153->kill_sb() takes a write-locked superblock, does all shutdown work on it,
 154unlocks and drops the reference.
 155
 156--------------------------- address_space_operations --------------------------
 157prototypes:
 158        int (*writepage)(struct page *page, struct writeback_control *wbc);
 159        int (*readpage)(struct file *, struct page *);
 160        int (*sync_page)(struct page *);
 161        int (*writepages)(struct address_space *, struct writeback_control *);
 162        int (*set_page_dirty)(struct page *page);
 163        int (*readpages)(struct file *filp, struct address_space *mapping,
 164                        struct list_head *pages, unsigned nr_pages);
 165        int (*write_begin)(struct file *, struct address_space *mapping,
 166                                loff_t pos, unsigned len, unsigned flags,
 167                                struct page **pagep, void **fsdata);
 168        int (*write_end)(struct file *, struct address_space *mapping,
 169                                loff_t pos, unsigned len, unsigned copied,
 170                                struct page *page, void *fsdata);
 171        sector_t (*bmap)(struct address_space *, sector_t);
 172        int (*invalidatepage) (struct page *, unsigned long);
 173        int (*releasepage) (struct page *, int);
 174        int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
 175                        loff_t offset, unsigned long nr_segs);
 176        int (*launder_page) (struct page *);
 177
 178locking rules:
 179        All except set_page_dirty may block
 180
 181                        BKL     PageLocked(page)        i_sem
 182writepage:              no      yes, unlocks (see below)
 183readpage:               no      yes, unlocks
 184sync_page:              no      maybe
 185writepages:             no
 186set_page_dirty:         no      no
 187readpages:              no
 188write_begin:            no      locks the page          yes
 189write_end:              no      yes, unlocks            yes
 190perform_write:          no      n/a                     yes
 191bmap:                   no
 192invalidatepage:         no      yes
 193releasepage:            no      yes
 194direct_IO:              no
 195launder_page:           no      yes
 196
 197        ->write_begin(), ->write_end(), ->sync_page() and ->readpage()
 198may be called from the request handler (/dev/loop).
 199
 200        ->readpage() unlocks the page, either synchronously or via I/O
 201completion.
 202
 203        ->readpages() populates the pagecache with the passed pages and starts
 204I/O against them.  They come unlocked upon I/O completion.
 205
 206        ->writepage() is used for two purposes: for "memory cleansing" and for
 207"sync".  These are quite different operations and the behaviour may differ
 208depending upon the mode.
 209
 210If writepage is called for sync (wbc->sync_mode != WBC_SYNC_NONE) then
 211it *must* start I/O against the page, even if that would involve
 212blocking on in-progress I/O.
 213
 214If writepage is called for memory cleansing (sync_mode ==
 215WBC_SYNC_NONE) then its role is to get as much writeout underway as
 216possible.  So writepage should try to avoid blocking against
 217currently-in-progress I/O.
 218
 219If the filesystem is not called for "sync" and it determines that it
 220would need to block against in-progress I/O to be able to start new I/O
 221against the page the filesystem should redirty the page with
 222redirty_page_for_writepage(), then unlock the page and return zero.
 223This may also be done to avoid internal deadlocks, but rarely.
 224
 225If the filesystem is called for sync then it must wait on any
 226in-progress I/O and then start new I/O.
 227
 228The filesystem should unlock the page synchronously, before returning to the
 229caller, unless ->writepage() returns special WRITEPAGE_ACTIVATE
 230value. WRITEPAGE_ACTIVATE means that page cannot really be written out
 231currently, and VM should stop calling ->writepage() on this page for some
 232time. VM does this by moving page to the head of the active list, hence the
 233name.
 234
 235Unless the filesystem is going to redirty_page_for_writepage(), unlock the page
 236and return zero, writepage *must* run set_page_writeback() against the page,
 237followed by unlocking it.  Once set_page_writeback() has been run against the
 238page, write I/O can be submitted and the write I/O completion handler must run
 239end_page_writeback() once the I/O is complete.  If no I/O is submitted, the
 240filesystem must run end_page_writeback() against the page before returning from
 241writepage.
 242
 243That is: after 2.5.12, pages which are under writeout are *not* locked.  Note,
 244if the filesystem needs the page to be locked during writeout, that is ok, too,
 245the page is allowed to be unlocked at any point in time between the calls to
 246set_page_writeback() and end_page_writeback().
 247
 248Note, failure to run either redirty_page_for_writepage() or the combination of
 249set_page_writeback()/end_page_writeback() on a page submitted to writepage
 250will leave the page itself marked clean but it will be tagged as dirty in the
 251radix tree.  This incoherency can lead to all sorts of hard-to-debug problems
 252in the filesystem like having dirty inodes at umount and losing written data.
 253
 254        ->sync_page() locking rules are not well-defined - usually it is called
 255with lock on page, but that is not guaranteed. Considering the currently
 256existing instances of this method ->sync_page() itself doesn't look
 257well-defined...
 258
 259        ->writepages() is used for periodic writeback and for syscall-initiated
 260sync operations.  The address_space should start I/O against at least
 261*nr_to_write pages.  *nr_to_write must be decremented for each page which is
 262written.  The address_space implementation may write more (or less) pages
 263than *nr_to_write asks for, but it should try to be reasonably close.  If
 264nr_to_write is NULL, all dirty pages must be written.
 265
 266writepages should _only_ write pages which are present on
 267mapping->io_pages.
 268
 269        ->set_page_dirty() is called from various places in the kernel
 270when the target page is marked as needing writeback.  It may be called
 271under spinlock (it cannot block) and is sometimes called with the page
 272not locked.
 273
 274        ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some
 275filesystems and by the swapper. The latter will eventually go away. All
 276instances do not actually need the BKL. Please, keep it that way and don't
 277breed new callers.
 278
 279        ->invalidatepage() is called when the filesystem must attempt to drop
 280some or all of the buffers from the page when it is being truncated.  It
 281returns zero on success.  If ->invalidatepage is zero, the kernel uses
 282block_invalidatepage() instead.
 283
 284        ->releasepage() is called when the kernel is about to try to drop the
 285buffers from the page in preparation for freeing it.  It returns zero to
 286indicate that the buffers are (or may be) freeable.  If ->releasepage is zero,
 287the kernel assumes that the fs has no private interest in the buffers.
 288
 289        ->launder_page() may be called prior to releasing a page if
 290it is still found to be dirty. It returns zero if the page was successfully
 291cleaned, or an error value if not. Note that in order to prevent the page
 292getting mapped back in and redirtied, it needs to be kept locked
 293across the entire operation.
 294
 295        Note: currently almost all instances of address_space methods are
 296using BKL for internal serialization and that's one of the worst sources
 297of contention. Normally they are calling library functions (in fs/buffer.c)
 298and pass foo_get_block() as a callback (on local block-based filesystems,
 299indeed). BKL is not needed for library stuff and is usually taken by
 300foo_get_block(). It's an overkill, since block bitmaps can be protected by
 301internal fs locking and real critical areas are much smaller than the areas
 302filesystems protect now.
 303
 304----------------------- file_lock_operations ------------------------------
 305prototypes:
 306        void (*fl_insert)(struct file_lock *);  /* lock insertion callback */
 307        void (*fl_remove)(struct file_lock *);  /* lock removal callback */
 308        void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
 309        void (*fl_release_private)(struct file_lock *);
 310
 311
 312locking rules:
 313                        BKL     may block
 314fl_insert:              yes     no
 315fl_remove:              yes     no
 316fl_copy_lock:           yes     no
 317fl_release_private:     yes     yes
 318
 319----------------------- lock_manager_operations ---------------------------
 320prototypes:
 321        int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
 322        void (*fl_notify)(struct file_lock *);  /* unblock callback */
 323        void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
 324        void (*fl_release_private)(struct file_lock *);
 325        void (*fl_break)(struct file_lock *); /* break_lease callback */
 326
 327locking rules:
 328                        BKL     may block
 329fl_compare_owner:       yes     no
 330fl_notify:              yes     no
 331fl_copy_lock:           yes     no
 332fl_release_private:     yes     yes
 333fl_break:               yes     no
 334
 335        Currently only NFSD and NLM provide instances of this class. None of
 336them block. If you have out-of-tree instances - please, show up. Locking
 337in that area will change.
 338--------------------------- buffer_head -----------------------------------
 339prototypes:
 340        void (*b_end_io)(struct buffer_head *bh, int uptodate);
 341
 342locking rules:
 343        called from interrupts. In other words, extreme care is needed here.
 344bh is locked, but that's the only guarantee we have here. Currently only RAID1,
 345highmem, fs/buffer.c, and fs/ntfs/aops.c are providing these. Block devices
 346call this method upon the IO completion.
 347
 348--------------------------- block_device_operations -----------------------
 349prototypes:
 350        int (*open) (struct inode *, struct file *);
 351        int (*release) (struct inode *, struct file *);
 352        int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
 353        int (*media_changed) (struct gendisk *);
 354        int (*revalidate_disk) (struct gendisk *);
 355
 356locking rules:
 357                        BKL     bd_sem
 358open:                   yes     yes
 359release:                yes     yes
 360ioctl:                  yes     no
 361media_changed:          no      no
 362revalidate_disk:        no      no
 363
 364The last two are called only from check_disk_change().
 365
 366--------------------------- file_operations -------------------------------
 367prototypes:
 368        loff_t (*llseek) (struct file *, loff_t, int);
 369        ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
 370        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 371        ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 372        ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 373        int (*readdir) (struct file *, void *, filldir_t);
 374        unsigned int (*poll) (struct file *, struct poll_table_struct *);
 375        int (*ioctl) (struct inode *, struct file *, unsigned int,
 376                        unsigned long);
 377        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 378        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 379        int (*mmap) (struct file *, struct vm_area_struct *);
 380        int (*open) (struct inode *, struct file *);
 381        int (*flush) (struct file *);
 382        int (*release) (struct inode *, struct file *);
 383        int (*fsync) (struct file *, struct dentry *, int datasync);
 384        int (*aio_fsync) (struct kiocb *, int datasync);
 385        int (*fasync) (int, struct file *, int);
 386        int (*lock) (struct file *, int, struct file_lock *);
 387        ssize_t (*readv) (struct file *, const struct iovec *, unsigned long,
 388                        loff_t *);
 389        ssize_t (*writev) (struct file *, const struct iovec *, unsigned long,
 390                        loff_t *);
 391        ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t,
 392                        void __user *);
 393        ssize_t (*sendpage) (struct file *, struct page *, int, size_t,
 394                        loff_t *, int);
 395        unsigned long (*get_unmapped_area)(struct file *, unsigned long,
 396                        unsigned long, unsigned long, unsigned long);
 397        int (*check_flags)(int);
 398};
 399
 400locking rules:
 401        All may block.
 402                        BKL
 403llseek:                 no      (see below)
 404read:                   no
 405aio_read:               no
 406write:                  no
 407aio_write:              no
 408readdir:                no
 409poll:                   no
 410ioctl:                  yes     (see below)
 411unlocked_ioctl:         no      (see below)
 412compat_ioctl:           no
 413mmap:                   no
 414open:                   no
 415flush:                  no
 416release:                no
 417fsync:                  no      (see below)
 418aio_fsync:              no
 419fasync:                 no
 420lock:                   yes
 421readv:                  no
 422writev:                 no
 423sendfile:               no
 424sendpage:               no
 425get_unmapped_area:      no
 426check_flags:            no
 427
 428->llseek() locking has moved from llseek to the individual llseek
 429implementations.  If your fs is not using generic_file_llseek, you
 430need to acquire and release the appropriate locks in your ->llseek().
 431For many filesystems, it is probably safe to acquire the inode
 432semaphore.  Note some filesystems (e.g. remote ones) provide no
 433protection for i_size so you will need to use the BKL.
 434
 435Note: ext2_release() was *the* source of contention on fs-intensive
 436loads and dropping BKL on ->release() helps to get rid of that (we still
 437grab BKL for cases when we close a file that had been opened r/w, but that
 438can and should be done using the internal locking with smaller critical areas).
 439Current worst offender is ext2_get_block()...
 440
 441->fasync() is called without BKL protection, and is responsible for
 442maintaining the FASYNC bit in filp->f_flags.  Most instances call
 443fasync_helper(), which does that maintenance, so it's not normally
 444something one needs to worry about.  Return values > 0 will be mapped to
 445zero in the VFS layer.
 446
 447->readdir() and ->ioctl() on directories must be changed. Ideally we would
 448move ->readdir() to inode_operations and use a separate method for directory
 449->ioctl() or kill the latter completely. One of the problems is that for
 450anything that resembles union-mount we won't have a struct file for all
 451components. And there are other reasons why the current interface is a mess...
 452
 453->ioctl() on regular files is superseded by the ->unlocked_ioctl() that
 454doesn't take the BKL.
 455
 456->read on directories probably must go away - we should just enforce -EISDIR
 457in sys_read() and friends.
 458
 459->fsync() has i_mutex on inode.
 460
 461--------------------------- dquot_operations -------------------------------
 462prototypes:
 463        int (*initialize) (struct inode *, int);
 464        int (*drop) (struct inode *);
 465        int (*alloc_space) (struct inode *, qsize_t, int);
 466        int (*alloc_inode) (const struct inode *, unsigned long);
 467        int (*free_space) (struct inode *, qsize_t);
 468        int (*free_inode) (const struct inode *, unsigned long);
 469        int (*transfer) (struct inode *, struct iattr *);
 470        int (*write_dquot) (struct dquot *);
 471        int (*acquire_dquot) (struct dquot *);
 472        int (*release_dquot) (struct dquot *);
 473        int (*mark_dirty) (struct dquot *);
 474        int (*write_info) (struct super_block *, int);
 475
 476These operations are intended to be more or less wrapping functions that ensure
 477a proper locking wrt the filesystem and call the generic quota operations.
 478
 479What filesystem should expect from the generic quota functions:
 480
 481                FS recursion    Held locks when called
 482initialize:     yes             maybe dqonoff_sem
 483drop:           yes             -
 484alloc_space:    ->mark_dirty()  -
 485alloc_inode:    ->mark_dirty()  -
 486free_space:     ->mark_dirty()  -
 487free_inode:     ->mark_dirty()  -
 488transfer:       yes             -
 489write_dquot:    yes             dqonoff_sem or dqptr_sem
 490acquire_dquot:  yes             dqonoff_sem or dqptr_sem
 491release_dquot:  yes             dqonoff_sem or dqptr_sem
 492mark_dirty:     no              -
 493write_info:     yes             dqonoff_sem
 494
 495FS recursion means calling ->quota_read() and ->quota_write() from superblock
 496operations.
 497
 498->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
 499only directly by the filesystem and do not call any fs functions except
 500the ->mark_dirty() operation.
 501
 502More details about quota locking can be found in fs/dquot.c.
 503
 504--------------------------- vm_operations_struct -----------------------------
 505prototypes:
 506        void (*open)(struct vm_area_struct*);
 507        void (*close)(struct vm_area_struct*);
 508        int (*fault)(struct vm_area_struct*, struct vm_fault *);
 509        int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
 510        int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
 511
 512locking rules:
 513                BKL     mmap_sem        PageLocked(page)
 514open:           no      yes
 515close:          no      yes
 516fault:          no      yes             can return with page locked
 517page_mkwrite:   no      yes             can return with page locked
 518access:         no      yes
 519
 520        ->fault() is called when a previously not present pte is about
 521to be faulted in. The filesystem must find and return the page associated
 522with the passed in "pgoff" in the vm_fault structure. If it is possible that
 523the page may be truncated and/or invalidated, then the filesystem must lock
 524the page, then ensure it is not already truncated (the page lock will block
 525subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
 526locked. The VM will unlock the page.
 527
 528        ->page_mkwrite() is called when a previously read-only pte is
 529about to become writeable. The filesystem again must ensure that there are
 530no truncate/invalidate races, and then return with the page locked. If
 531the page has been truncated, the filesystem should not look up a new page
 532like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
 533will cause the VM to retry the fault.
 534
 535        ->access() is called when get_user_pages() fails in
 536access_process_vm(), typically used to debug a process through
 537/proc/pid/mem or ptrace.  This function is needed only for
 538VM_IO | VM_PFNMAP VMAs.
 539
 540================================================================================
 541                        Dubious stuff
 542
 543(if you break something or notice that it is broken and do not fix it yourself
 544- at least put it here)
 545
 546ipc/shm.c::shm_delete() - may need BKL.
 547->read() and ->write() in many drivers are (probably) missing BKL.
 548
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.