linux-old/arch/mips64/lib/memcpy.S
<<
>>
Prefs
   1/*
   2 * This file is subject to the terms and conditions of the GNU General Public
   3 * License.  See the file "COPYING" in the main directory of this archive
   4 * for more details.
   5 *
   6 * Unified implementation of memcpy, memmove and the __copy_user backend.
   7 *
   8 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
   9 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  10 * Copyright (C) 2002 Broadcom, Inc.
  11 *   memcpy/copy_user author: Mark Vandevoorde
  12 *
  13 * Mnemonic names for arguments to memcpy/__copy_user
  14 */
  15#include <linux/config.h>
  16#include <asm/asm.h>
  17#include <asm/offset.h>
  18#include <asm/regdef.h>
  19
  20#define dst a0  /* destination pointer (first argument) */
  21#define src a1  /* source pointer (second argument) */
  22#define len a2  /* byte count (third argument); __copy_user reports uncopied bytes here */
  23
  24/*
  25 * Spec
  26 *
  27 * memcpy copies len bytes from src to dst and sets v0 to dst.
  28 * It assumes that
  29 *   - src and dst don't overlap
  30 *   - src is readable
  31 *   - dst is writable
  32 * memcpy uses the standard calling convention
  33 *
  34 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
  35 * the number of uncopied bytes due to an exception caused by a read or write.
  36 * __copy_user assumes that src and dst don't overlap, and that the call is
  37 * implementing one of the following:
  38 *   copy_to_user
  39 *     - src is readable  (no exceptions when reading src)
  40 *   copy_from_user
  41 *     - dst is writable  (no exceptions when writing dst)
  42 * __copy_user uses a non-standard calling convention; see
  43 * include/asm-mips/uaccess.h
  44 *
  45 * When an exception happens on a load, the handler must
  46 * ensure that all of the destination buffer is overwritten to prevent
  47 * leaking information to user mode programs.
  48 */
  49
  50/*
  51 * Implementation
  52 */
  53
  54/*
  55 * The exception handler for loads requires that:
  56 *  1- AT contain the address of the byte just past the end of the source
  57 *     of the copy,
  58 *  2- src_entry <= src < AT, and
  59 *  3- (dst - src) == (dst_entry - src_entry),
  60 * The _entry suffix denotes values when __copy_user was called.
  61 *
  62 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
  63 * (2) is met by incrementing src by the number of bytes copied
  64 * (3) is met by not doing loads between a pair of increments of dst and src
  65 *
  66 * The exception handlers for stores adjust len (if necessary) and return.
  67 * These handlers do not need to overwrite any data.
  68 *
  69 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
  70 * they're not protected.
  71 */
  72
  73#define EXC(inst_reg,addr,handler)  /* wrap one load/store that may fault */ \
  749:      inst_reg, addr;             /* local label 9 marks the guarded instruction */ \
  75        .section __ex_table,"a";    /* switch to the __ex_table section */ \
  76        PTR     9b, handler;        /* emit the pair (guarded instruction, handler) */ \
  77        .previous                   /* switch back to the section we came from */
  78
  79/*
  80 * Only on the 64-bit kernel can we make use of 64-bit registers.
  81 */
  82#ifdef CONFIG_MIPS64
  83#define USE_DOUBLE      /* selects the 64-bit (doubleword) copy unit below */
  84#endif
  85
  86#ifdef USE_DOUBLE
  87
  88#define LOAD   ld       /* aligned load of one unit */
  89#define LOADL  ldl      /* unaligned load, left part */
  90#define LOADR  ldr      /* unaligned load, right part */
  91#define STOREL sdl      /* unaligned store, left part */
  92#define STORER sdr      /* unaligned store, right part */
  93#define STORE  sd       /* aligned store of one unit */
  94#define ADD    daddu
  95#define SUB    dsubu
  96#define SRL    dsrl
  97#define SRA    dsra
  98#define SLL    dsll
  99#define SLLV   dsllv
 100#define SRLV   dsrlv
 101#define NBYTES 8        /* bytes per copy unit */
 102#define LOG_NBYTES 3    /* log2(NBYTES) */
 103
 104/*
 105 * We share this code base with the mips32 tree, which uses the o32 ABI
 106 * register definitions, so we need to redefine the temporaries from
 107 * the n64 ABI register naming to the o32 ABI register naming.
 108 */
 109#undef t0
 110#undef t1
 111#undef t2
 112#undef t3
 113#define t0      $8
 114#define t1      $9
 115#define t2      $10
 116#define t3      $11
 117#define t4      $12
 118#define t5      $13
 119#define t6      $14
 120#define t7      $15
 121        
 122#else
 123
 124#define LOAD   lw       /* aligned load of one unit */
 125#define LOADL  lwl      /* unaligned load, left part */
 126#define LOADR  lwr      /* unaligned load, right part */
 127#define STOREL swl      /* unaligned store, left part */
 128#define STORER swr      /* unaligned store, right part */
 129#define STORE  sw       /* aligned store of one unit */
 130#define ADD    addu
 131#define SUB    subu
 132#define SRL    srl
 133#define SLL    sll
 134#define SRA    sra
 135#define SLLV   sllv
 136#define SRLV   srlv
 137#define NBYTES 4        /* bytes per copy unit */
 138#define LOG_NBYTES 2    /* log2(NBYTES) */
 139
 140#endif /* USE_DOUBLE */
 141
 142#ifdef CONFIG_CPU_LITTLE_ENDIAN
 143#define LDFIRST LOADR           /* partial load applied at FIRST(n), the lowest byte of a unit */
 144#define LDREST  LOADL           /* partial load applied at REST(n), the highest byte of a unit */
 145#define STFIRST STORER          /* partial store applied at the lowest byte address */
 146#define STREST  STOREL          /* partial store applied at the highest byte address */
 147#define SHIFT_DISCARD SLLV      /* shift used to drop unwanted bits before the final STREST */
 148#else
 149#define LDFIRST LOADL           /* partial load applied at FIRST(n), the lowest byte of a unit */
 150#define LDREST  LOADR           /* partial load applied at REST(n), the highest byte of a unit */
 151#define STFIRST STOREL          /* partial store applied at the lowest byte address */
 152#define STREST  STORER          /* partial store applied at the highest byte address */
 153#define SHIFT_DISCARD SRLV      /* shift used to drop unwanted bits before the final STREST */
 154#endif
 155
 156#define FIRST(unit) ((unit)*NBYTES)             /* byte offset of the first byte of unit n */
 157#define REST(unit)  (FIRST(unit)+NBYTES-1)      /* byte offset of the last byte of unit n */
 158#define UNIT(unit)  FIRST(unit)                 /* byte offset of unit n for aligned accesses */
 159
 160#define ADDRMASK (NBYTES-1)                     /* low address bits: offset within a unit */
 161
 162        .text
 163        .set    noreorder
 164        .set    noat
 165
 166/*
 167 * A combined memcpy/__copy_user
 168 * __copy_user sets len to 0 for success; else to an upper bound of
 169 * the number of uncopied bytes.
 170 * memcpy sets v0 to dst.
 171 */
 172        .align  5
 173LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
 174        move    v0, dst                         /* return value */
 175__memcpy:                                       /* memmove branches here with v0 already set */
 176FEXPORT(__copy_user)                            /* __copy_user entry: skips the v0 setup above */
 177        /*
 178         * Note: dst & src may be unaligned, len may be 0
 179         * Temps
 180         */
 181#define rem t8                                  /* byte count the current loop must leave uncopied */
 182
 183        /*
 184         * The "issue break"s below are very approximate.
 185         * Issue delays for dcache fills will perturb the schedule, as will
 186         * load queue full replay traps, etc.
 187         *
 188         * If len < NBYTES use byte operations.
 189         */
 190        PREF(   0, 0(src) )                     /* hint 0 is used on src throughout */
 191        PREF(   1, 0(dst) )                     /* hint 1 is used on dst throughout */
 192        sltu    t2, len, NBYTES                 /* t2 = (len < NBYTES) */
 193        and     t1, dst, ADDRMASK               /* t1 = offset of dst within a unit */
 194        PREF(   0, 1*32(src) )
 195        PREF(   1, 1*32(dst) )
 196        bnez    t2, copy_bytes_checklen         /* fewer than NBYTES bytes: copy byte by byte */
 197         and    t0, src, ADDRMASK               /* delay slot: t0 = offset of src within a unit */
 198        PREF(   0, 2*32(src) )
 199        PREF(   1, 2*32(dst) )
 200        bnez    t1, dst_unaligned               /* dst misaligned: align it first */
 201         nop
 202        bnez    t0, src_unaligned_dst_aligned   /* delay slot is the SRL under both_aligned */
 203        /*
 204         * use delay slot for fall-through
 205         * src and dst are aligned; need to compute rem
 206         */
 207both_aligned:
 208         SRL    t0, len, LOG_NBYTES+3    # t0 = len / (8*NBYTES); +3 for 8 units/iter
 209        beqz    t0, cleanup_both_aligned # len < 8*NBYTES
 210         and    rem, len, (8*NBYTES-1)   # delay slot: rem = len % (8*NBYTES)
 211        PREF(   0, 3*32(src) )
 212        PREF(   1, 3*32(dst) )
 213        .align  4
 2141:                                      /* main loop: 8 units per iteration */
 215EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
 216EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
 217EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
 218EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
 219        SUB     len, len, 8*NBYTES      /* len = bytes left once this iteration completes */
 220EXC(    LOAD    t4, UNIT(4)(src),       l_exc_copy)
 221EXC(    LOAD    t7, UNIT(5)(src),       l_exc_copy)
 222EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)      /* s_exc_pNu adds N*NBYTES back to len */
 223EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
 224EXC(    LOAD    t0, UNIT(6)(src),       l_exc_copy)     /* t0 and t1 are reused for units 6 and 7 */
 225EXC(    LOAD    t1, UNIT(7)(src),       l_exc_copy)
 226        ADD     src, src, 8*NBYTES      /* src and dst advance together, after the last load */
 227        ADD     dst, dst, 8*NBYTES
 228EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)      /* negative offsets: dst already advanced */
 229EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
 230EXC(    STORE   t4, UNIT(-4)(dst),      s_exc_p4u)
 231EXC(    STORE   t7, UNIT(-3)(dst),      s_exc_p3u)
 232EXC(    STORE   t0, UNIT(-2)(dst),      s_exc_p2u)
 233EXC(    STORE   t1, UNIT(-1)(dst),      s_exc_p1u)
 234        PREF(   0, 8*32(src) )
 235        PREF(   1, 8*32(dst) )
 236        bne     len, rem, 1b            /* loop until only rem bytes remain */
 237         nop
 238
 239        /*
 240         * len == rem == the number of bytes left to copy < 8*NBYTES
 241         */
 242cleanup_both_aligned:
 243        beqz    len, done
 244         sltu   t0, len, 4*NBYTES       /* delay slot: t0 = (len < 4*NBYTES) */
 245        bnez    t0, less_than_4units
 246         and    rem, len, (NBYTES-1)    # delay slot: rem = len % NBYTES
 247        /*
 248         * len >= 4*NBYTES
 249         */
 250EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
 251EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
 252EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
 253EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
 254        SUB     len, len, 4*NBYTES      /* len = bytes left once these four stores complete */
 255        ADD     src, src, 4*NBYTES
 256EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)      /* s_exc_pNu adds N*NBYTES back to len */
 257EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
 258EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
 259EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
 260        beqz    len, done
 261         ADD    dst, dst, 4*NBYTES      /* delay slot: dst catches up with src */
 262less_than_4units:
 263        /*
 264         * rem = len % NBYTES
 265         */
 266        beq     rem, len, copy_bytes    /* no whole unit left: only len < NBYTES bytes remain */
 267         nop
 2681:                                      /* copy one aligned unit per iteration */
 269EXC(    LOAD    t0, 0(src),             l_exc)
 270        ADD     src, src, NBYTES
 271        SUB     len, len, NBYTES
 272EXC(    STORE   t0, 0(dst),             s_exc_p1u)
 273        bne     rem, len, 1b
 274         ADD    dst, dst, NBYTES        /* delay slot: advance dst on every iteration */
 275
 276        /*
 277         * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
 278         * A loop would do only a byte at a time with possible branch
 279         * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
 280         * because can't assume read-access to dst.  Instead, use
 281         * STREST dst, which doesn't require read access to dst.
 282         *
 283         * This code should perform better than a simple loop on modern,
 284         * wide-issue mips processors because the code has fewer branches and
 285         * more instruction-level parallelism.
 286         */
 287#define bits t2                         /* t2 is reused as a bit count from here on */
 288        beqz    len, done
 289         ADD    t1, dst, len    # t1 is just past last byte of dst
 290        li      bits, 8*NBYTES  /* bits in one whole unit */
 291        SLL     rem, len, 3     # rem = number of bits to keep
 292EXC(    LOAD    t0, 0(src),             l_exc)
 293        SUB     bits, bits, rem # bits = number of bits to discard
 294        SHIFT_DISCARD t0, t0, bits      /* shift the unwanted bits out of t0 */
 295EXC(    STREST  t0, -1(t1),             s_exc)  /* partial store ending at the last byte of dst */
 296        jr      ra
 297         move   len, zero       /* delay slot: report zero uncopied bytes */
 298dst_unaligned:
 299        /*
 300         * dst is unaligned
 301         * t0 = src & ADDRMASK
 302         * t1 = dst & ADDRMASK; t1 > 0
 303         * len >= NBYTES
 304         *
 305         * Copy enough bytes to align dst
 306         * Set match = (src and dst have same alignment)
 307         */
 308#define match rem                       /* zero when src and dst share the same offset in a unit */
 309EXC(    LDFIRST t3, FIRST(0)(src),      l_exc)
 310        ADD     t2, zero, NBYTES
 311EXC(    LDREST  t3, REST(0)(src),       l_exc_copy)     /* t3 = one full unit read from src */
 312        SUB     t2, t2, t1      # t2 = number of bytes copied
 313        xor     match, t0, t1   /* match = t0 ^ t1 */
 314EXC(    STFIRST t3, FIRST(0)(dst),      s_exc)          /* partial store starting at dst */
 315        beq     len, t2, done
 316         SUB    len, len, t2    /* delay slot: runs whether or not the branch is taken */
 317        ADD     dst, dst, t2
 318        beqz    match, both_aligned
 319         ADD    src, src, t2    /* delay slot: src advances by the same t2 bytes as dst */
 320
 321src_unaligned_dst_aligned:
 322        SRL     t0, len, LOG_NBYTES+2    # t0 = len / (4*NBYTES); +2 for 4 units/iter
 323        PREF(   0, 3*32(src) )
 324        beqz    t0, cleanup_src_unaligned
 325         and    rem, len, (4*NBYTES-1)   # delay slot: rem = len % 4*NBYTES
 326        PREF(   1, 3*32(dst) )
 3271:
 328/*
 329 * Avoid consecutive LD*'s to the same register since some mips
 330 * implementations can't issue them in the same cycle.
 331 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 332 * are to the same unit (unless src is aligned, but it's not).
 333 */
 334EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
 335EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
 336        SUB     len, len, 4*NBYTES      /* len = bytes left once this iteration completes */
 337EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
 338EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
 339EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
 340EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
 341EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
 342EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
 343        PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
 344        ADD     src, src, 4*NBYTES
 345#ifdef CONFIG_CPU_SB1
 346        nop                             # improves slotting
 347#endif
 348EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)      /* dst is aligned, so plain STOREs are used */
 349EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
 350EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
 351EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
 352        PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
 353        bne     len, rem, 1b            /* loop until only rem bytes remain */
 354         ADD    dst, dst, 4*NBYTES      /* delay slot: advance dst on every iteration */
 355
 356cleanup_src_unaligned:
 357        beqz    len, done
 358         and    rem, len, NBYTES-1  # delay slot: rem = len % NBYTES
 359        beq     rem, len, copy_bytes    /* no whole unit left: only len < NBYTES bytes remain */
 360         nop
 3611:                                      /* copy one unit per iteration from unaligned src */
 362EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
 363EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
 364        ADD     src, src, NBYTES
 365        SUB     len, len, NBYTES
 366EXC(    STORE   t0, 0(dst),             s_exc_p1u)
 367        bne     len, rem, 1b
 368         ADD    dst, dst, NBYTES        /* delay slot; then falls into copy_bytes_checklen */
 369
 370copy_bytes_checklen:
 371        beqz    len, done               /* nothing left to copy */
 372         nop
 373copy_bytes:
 374        /* 0 < len < NBYTES  */
 375#define COPY_BYTE(N)    /* copy byte N; the store sits in the beqz delay slot */ \
 376EXC(    lb      t0, N(src), l_exc);     \
 377        SUB     len, len, 1;            \
 378        beqz    len, done;              \
 379EXC(     sb     t0, N(dst), s_exc_p1)
 380
 381        COPY_BYTE(0)
 382        COPY_BYTE(1)
 383#ifdef USE_DOUBLE
 384        COPY_BYTE(2)
 385        COPY_BYTE(3)
 386        COPY_BYTE(4)
 387        COPY_BYTE(5)
 388#endif
 389EXC(    lb      t0, NBYTES-2(src), l_exc)       /* last possible byte, at offset NBYTES-2 */
 390        SUB     len, len, 1
 391        jr      ra
 392EXC(     sb     t0, NBYTES-2(dst), s_exc_p1)    /* delay slot of jr: final store */
 393done:
 394        jr      ra
 395         nop
 396        END(memcpy)
 397
 398l_exc_copy:
 399        /*
 400         * Copy bytes from src until faulting load address (or until a
 401         * lb faults)
 402         *
 403         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
 404         * may be more than a byte beyond the last address.
 405         * Hence, the lb below may get an exception.
 406         *
 407         * Assumes src < THREAD_BUADDR($28)
 408         */
 409        LOAD    t0, THREAD_BUADDR($28)  /* t0 = address at which to stop the byte copy */
 4101:
 411EXC(    lb      t1, 0(src),     l_exc)
 412        ADD     src, src, 1
 413        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
 414        bne     src, t0, 1b
 415         ADD    dst, dst, 1     /* delay slot: keeps dst in step with src */
 416l_exc:
 417        LOAD    t0, THREAD_BUADDR($28)  # t0 is just past last good address
 418         nop
 419        SUB     len, AT, t0             # len = number of uncopied bytes (AT is the end of the source)
 420        /*
 421         * Here's where we rely on src and dst being incremented in tandem,
 422         *   See (3) above.
 423         * dst += (fault addr - src) to put dst at first byte to clear
 424         */
 425        ADD     dst, t0                 # dst += fault address (start address is built in dst, i.e. a0)
 426        SUB     dst, src                # dst -= src, so dst is now the first byte to clear
 427        /*
 428         * Clear len bytes starting at dst.  Can't call __bzero because it
 429         * might modify len.  An inefficient loop for these rare times...
 430         */
 431        beqz    len, done
 432         SUB    src, len, 1             /* delay slot: src is reused as the loop counter, len - 1 */
 4331:      sb      zero, 0(dst)
 434        ADD     dst, dst, 1
 435        bnez    src, 1b
 436         SUB    src, src, 1             /* delay slot: count down; len itself is left untouched */
 437        jr      ra
 438         nop
 439
 440
 441#define SEXC(n)         /* store fault handler s_exc_p<n>u */ \
 442s_exc_p ## n ## u:                      \
 443        jr      ra;                     \
 444         ADD    len, len, n*NBYTES      /* delay slot: add n units back to len */
 445
 446SEXC(8)
 447SEXC(7)
 448SEXC(6)
 449SEXC(5)
 450SEXC(4)
 451SEXC(3)
 452SEXC(2)
 453SEXC(1)
 454
 455s_exc_p1:                               /* store fault in the byte-copy path */
 456        jr      ra
 457         ADD    len, len, 1             /* delay slot: add one byte back to len */
 458s_exc:                                  /* store fault where len needs no adjustment */
 459        jr      ra
 460         nop
 461
 462        .align  5
 463LEAF(memmove)                                   /* a0=dst a1=src a2=len */
 464        ADD     t0, a0, a2                      # t0 = dst + len
 465        ADD     t1, a1, a2                      # t1 = src + len
 466        sltu    t0, a1, t0                      # t0 = (src < dst + len); dst + len <= src -> memcpy
 467        sltu    t1, a0, t1                      # t1 = (dst < src + len); dst >= src + len -> memcpy
 468        and     t0, t1                          # t0 != 0 only when both tests hold (regions overlap)
 469        beqz    t0, __memcpy                    # no overlap: use the memcpy body
 470         move   v0, a0                          /* return value */
 471        beqz    a2, r_out                       /* delay slot is the first instruction of __rmemcpy */
 472        END(memmove)
 473
 474        /* fall through to __rmemcpy */
 475LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
 476         sltu   t0, a1, a0                      /* t0 = (src < dst) */
 477        beqz    t0, r_end_bytes_up              # src >= dst
 478         nop
 479        ADD     a0, a2                          # dst = dst + len
 480        ADD     a1, a2                          # src = src + len
 481
 482r_end_bytes:                                    /* src < dst: copy backwards from the end */
 483        lb      t0, -1(a1)
 484        SUB     a2, a2, 0x1
 485        sb      t0, -1(a0)
 486        SUB     a1, a1, 0x1
 487        bnez    a2, r_end_bytes
 488         SUB    a0, a0, 0x1                     /* delay slot: step dst down on every iteration */
 489
 490r_out:
 491        jr      ra
 492         move   a2, zero                        /* delay slot: a2 = 0 on return */
 493
 494r_end_bytes_up:                                 /* src >= dst: copy forwards from the start */
 495        lb      t0, (a1)
 496        SUB     a2, a2, 0x1
 497        sb      t0, (a0)
 498        ADD     a1, a1, 0x1
 499        bnez    a2, r_end_bytes_up
 500         ADD    a0, a0, 0x1                     /* delay slot: step dst up on every iteration */
 501
 502        jr      ra
 503         move   a2, zero                        /* delay slot: a2 = 0 on return */
 504        END(__rmemcpy)
 505
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.