linux/arch/i386/lib/mmx.c
<<
>>
Prefs
   1#include <linux/types.h>
   2#include <linux/string.h>
   3#include <linux/sched.h>
   4#include <linux/hardirq.h>
   5#include <linux/module.h>
   6
   7#include <asm/i387.h>
   8
   9
  10/*
  11 *      MMX 3DNow! library helper functions
  12 *
  13 *      To do:
  14 *      We can use MMX just for prefetch in IRQ's. This may be a win. 
  15 *              (reported so on K6-III)
  16 *      We should use a better code neutral filler for the short jump
  17 *              leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
  18 *      We also want to clobber the filler register so we don't get any
  19 *              register forwarding stalls on the filler. 
  20 *
  21 *      Add *user handling. Checksums are not a win with MMX on any CPU
  22 *      tested so far for any MMX solution figured.
  23 *
  24 *      22/09/2000 - Arjan van de Ven 
  25 *              Improved for non-engineering-sample Athlons
  26 *
  27 */
  28 
  29void *_mmx_memcpy(void *to, const void *from, size_t len)
  30{
  31        void *p;
  32        int i;
  33
  34        if (unlikely(in_interrupt()))
  35                return __memcpy(to, from, len);
  36
  37        p = to;
  38        i = len >> 6; /* len/64 */
  39
  40        kernel_fpu_begin();
  41
  42        __asm__ __volatile__ (
  43                "1: prefetch (%0)\n"            /* This set is 28 bytes */
  44                "   prefetch 64(%0)\n"
  45                "   prefetch 128(%0)\n"
  46                "   prefetch 192(%0)\n"
  47                "   prefetch 256(%0)\n"
  48                "2:  \n"
  49                ".section .fixup, \"ax\"\n"
  50                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
  51                "   jmp 2b\n"
  52                ".previous\n"
  53                ".section __ex_table,\"a\"\n"
  54                "       .align 4\n"
  55                "       .long 1b, 3b\n"
  56                ".previous"
  57                : : "r" (from) );
  58                
  59        
  60        for(; i>5; i--)
  61        {
  62                __asm__ __volatile__ (
  63                "1:  prefetch 320(%0)\n"
  64                "2:  movq (%0), %%mm0\n"
  65                "  movq 8(%0), %%mm1\n"
  66                "  movq 16(%0), %%mm2\n"
  67                "  movq 24(%0), %%mm3\n"
  68                "  movq %%mm0, (%1)\n"
  69                "  movq %%mm1, 8(%1)\n"
  70                "  movq %%mm2, 16(%1)\n"
  71                "  movq %%mm3, 24(%1)\n"
  72                "  movq 32(%0), %%mm0\n"
  73                "  movq 40(%0), %%mm1\n"
  74                "  movq 48(%0), %%mm2\n"
  75                "  movq 56(%0), %%mm3\n"
  76                "  movq %%mm0, 32(%1)\n"
  77                "  movq %%mm1, 40(%1)\n"
  78                "  movq %%mm2, 48(%1)\n"
  79                "  movq %%mm3, 56(%1)\n"
  80                ".section .fixup, \"ax\"\n"
  81                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
  82                "   jmp 2b\n"
  83                ".previous\n"
  84                ".section __ex_table,\"a\"\n"
  85                "       .align 4\n"
  86                "       .long 1b, 3b\n"
  87                ".previous"
  88                : : "r" (from), "r" (to) : "memory");
  89                from+=64;
  90                to+=64;
  91        }
  92
  93        for(; i>0; i--)
  94        {
  95                __asm__ __volatile__ (
  96                "  movq (%0), %%mm0\n"
  97                "  movq 8(%0), %%mm1\n"
  98                "  movq 16(%0), %%mm2\n"
  99                "  movq 24(%0), %%mm3\n"
 100                "  movq %%mm0, (%1)\n"
 101                "  movq %%mm1, 8(%1)\n"
 102                "  movq %%mm2, 16(%1)\n"
 103                "  movq %%mm3, 24(%1)\n"
 104                "  movq 32(%0), %%mm0\n"
 105                "  movq 40(%0), %%mm1\n"
 106                "  movq 48(%0), %%mm2\n"
 107                "  movq 56(%0), %%mm3\n"
 108                "  movq %%mm0, 32(%1)\n"
 109                "  movq %%mm1, 40(%1)\n"
 110                "  movq %%mm2, 48(%1)\n"
 111                "  movq %%mm3, 56(%1)\n"
 112                : : "r" (from), "r" (to) : "memory");
 113                from+=64;
 114                to+=64;
 115        }
 116        /*
 117         *      Now do the tail of the block
 118         */
 119        __memcpy(to, from, len&63);
 120        kernel_fpu_end();
 121        return p;
 122}
 123
 124#ifdef CONFIG_MK7
 125
 126/*
 127 *      The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 128 *      other MMX using processors do not.
 129 */
 130
 131static void fast_clear_page(void *page)
 132{
 133        int i;
 134
 135        kernel_fpu_begin();
 136        
 137        __asm__ __volatile__ (
 138                "  pxor %%mm0, %%mm0\n" : :
 139        );
 140
 141        for(i=0;i<4096/64;i++)
 142        {
 143                __asm__ __volatile__ (
 144                "  movntq %%mm0, (%0)\n"
 145                "  movntq %%mm0, 8(%0)\n"
 146                "  movntq %%mm0, 16(%0)\n"
 147                "  movntq %%mm0, 24(%0)\n"
 148                "  movntq %%mm0, 32(%0)\n"
 149                "  movntq %%mm0, 40(%0)\n"
 150                "  movntq %%mm0, 48(%0)\n"
 151                "  movntq %%mm0, 56(%0)\n"
 152                : : "r" (page) : "memory");
 153                page+=64;
 154        }
 155        /* since movntq is weakly-ordered, a "sfence" is needed to become
 156         * ordered again.
 157         */
 158        __asm__ __volatile__ (
 159                "  sfence \n" : :
 160        );
 161        kernel_fpu_end();
 162}
 163
 164static void fast_copy_page(void *to, void *from)
 165{
 166        int i;
 167
 168        kernel_fpu_begin();
 169
 170        /* maybe the prefetch stuff can go before the expensive fnsave...
 171         * but that is for later. -AV
 172         */
 173        __asm__ __volatile__ (
 174                "1: prefetch (%0)\n"
 175                "   prefetch 64(%0)\n"
 176                "   prefetch 128(%0)\n"
 177                "   prefetch 192(%0)\n"
 178                "   prefetch 256(%0)\n"
 179                "2:  \n"
 180                ".section .fixup, \"ax\"\n"
 181                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
 182                "   jmp 2b\n"
 183                ".previous\n"
 184                ".section __ex_table,\"a\"\n"
 185                "       .align 4\n"
 186                "       .long 1b, 3b\n"
 187                ".previous"
 188                : : "r" (from) );
 189
 190        for(i=0; i<(4096-320)/64; i++)
 191        {
 192                __asm__ __volatile__ (
 193                "1: prefetch 320(%0)\n"
 194                "2: movq (%0), %%mm0\n"
 195                "   movntq %%mm0, (%1)\n"
 196                "   movq 8(%0), %%mm1\n"
 197                "   movntq %%mm1, 8(%1)\n"
 198                "   movq 16(%0), %%mm2\n"
 199                "   movntq %%mm2, 16(%1)\n"
 200                "   movq 24(%0), %%mm3\n"
 201                "   movntq %%mm3, 24(%1)\n"
 202                "   movq 32(%0), %%mm4\n"
 203                "   movntq %%mm4, 32(%1)\n"
 204                "   movq 40(%0), %%mm5\n"
 205                "   movntq %%mm5, 40(%1)\n"
 206                "   movq 48(%0), %%mm6\n"
 207                "   movntq %%mm6, 48(%1)\n"
 208                "   movq 56(%0), %%mm7\n"
 209                "   movntq %%mm7, 56(%1)\n"
 210                ".section .fixup, \"ax\"\n"
 211                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
 212                "   jmp 2b\n"
 213                ".previous\n"
 214                ".section __ex_table,\"a\"\n"
 215                "       .align 4\n"
 216                "       .long 1b, 3b\n"
 217                ".previous"
 218                : : "r" (from), "r" (to) : "memory");
 219                from+=64;
 220                to+=64;
 221        }
 222        for(i=(4096-320)/64; i<4096/64; i++)
 223        {
 224                __asm__ __volatile__ (
 225                "2: movq (%0), %%mm0\n"
 226                "   movntq %%mm0, (%1)\n"
 227                "   movq 8(%0), %%mm1\n"
 228                "   movntq %%mm1, 8(%1)\n"
 229                "   movq 16(%0), %%mm2\n"
 230                "   movntq %%mm2, 16(%1)\n"
 231                "   movq 24(%0), %%mm3\n"
 232                "   movntq %%mm3, 24(%1)\n"
 233                "   movq 32(%0), %%mm4\n"
 234                "   movntq %%mm4, 32(%1)\n"
 235                "   movq 40(%0), %%mm5\n"
 236                "   movntq %%mm5, 40(%1)\n"
 237                "   movq 48(%0), %%mm6\n"
 238                "   movntq %%mm6, 48(%1)\n"
 239                "   movq 56(%0), %%mm7\n"
 240                "   movntq %%mm7, 56(%1)\n"
 241                : : "r" (from), "r" (to) : "memory");
 242                from+=64;
 243                to+=64;
 244        }
 245        /* since movntq is weakly-ordered, a "sfence" is needed to become
 246         * ordered again.
 247         */
 248        __asm__ __volatile__ (
 249                "  sfence \n" : :
 250        );
 251        kernel_fpu_end();
 252}
 253
 254#else
 255
 256/*
 257 *      Generic MMX implementation without K7 specific streaming
 258 */
 259 
 260static void fast_clear_page(void *page)
 261{
 262        int i;
 263        
 264        kernel_fpu_begin();
 265        
 266        __asm__ __volatile__ (
 267                "  pxor %%mm0, %%mm0\n" : :
 268        );
 269
 270        for(i=0;i<4096/128;i++)
 271        {
 272                __asm__ __volatile__ (
 273                "  movq %%mm0, (%0)\n"
 274                "  movq %%mm0, 8(%0)\n"
 275                "  movq %%mm0, 16(%0)\n"
 276                "  movq %%mm0, 24(%0)\n"
 277                "  movq %%mm0, 32(%0)\n"
 278                "  movq %%mm0, 40(%0)\n"
 279                "  movq %%mm0, 48(%0)\n"
 280                "  movq %%mm0, 56(%0)\n"
 281                "  movq %%mm0, 64(%0)\n"
 282                "  movq %%mm0, 72(%0)\n"
 283                "  movq %%mm0, 80(%0)\n"
 284                "  movq %%mm0, 88(%0)\n"
 285                "  movq %%mm0, 96(%0)\n"
 286                "  movq %%mm0, 104(%0)\n"
 287                "  movq %%mm0, 112(%0)\n"
 288                "  movq %%mm0, 120(%0)\n"
 289                : : "r" (page) : "memory");
 290                page+=128;
 291        }
 292
 293        kernel_fpu_end();
 294}
 295
 296static void fast_copy_page(void *to, void *from)
 297{
 298        int i;
 299        
 300        
 301        kernel_fpu_begin();
 302
 303        __asm__ __volatile__ (
 304                "1: prefetch (%0)\n"
 305                "   prefetch 64(%0)\n"
 306                "   prefetch 128(%0)\n"
 307                "   prefetch 192(%0)\n"
 308                "   prefetch 256(%0)\n"
 309                "2:  \n"
 310                ".section .fixup, \"ax\"\n"
 311                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
 312                "   jmp 2b\n"
 313                ".previous\n"
 314                ".section __ex_table,\"a\"\n"
 315                "       .align 4\n"
 316                "       .long 1b, 3b\n"
 317                ".previous"
 318                : : "r" (from) );
 319
 320        for(i=0; i<4096/64; i++)
 321        {
 322                __asm__ __volatile__ (
 323                "1: prefetch 320(%0)\n"
 324                "2: movq (%0), %%mm0\n"
 325                "   movq 8(%0), %%mm1\n"
 326                "   movq 16(%0), %%mm2\n"
 327                "   movq 24(%0), %%mm3\n"
 328                "   movq %%mm0, (%1)\n"
 329                "   movq %%mm1, 8(%1)\n"
 330                "   movq %%mm2, 16(%1)\n"
 331                "   movq %%mm3, 24(%1)\n"
 332                "   movq 32(%0), %%mm0\n"
 333                "   movq 40(%0), %%mm1\n"
 334                "   movq 48(%0), %%mm2\n"
 335                "   movq 56(%0), %%mm3\n"
 336                "   movq %%mm0, 32(%1)\n"
 337                "   movq %%mm1, 40(%1)\n"
 338                "   movq %%mm2, 48(%1)\n"
 339                "   movq %%mm3, 56(%1)\n"
 340                ".section .fixup, \"ax\"\n"
 341                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
 342                "   jmp 2b\n"
 343                ".previous\n"
 344                ".section __ex_table,\"a\"\n"
 345                "       .align 4\n"
 346                "       .long 1b, 3b\n"
 347                ".previous"
 348                : : "r" (from), "r" (to) : "memory");
 349                from+=64;
 350                to+=64;
 351        }
 352        kernel_fpu_end();
 353}
 354
 355
 356#endif
 357
 358/*
 359 *      Favour MMX for page clear and copy. 
 360 */
 361
 362static void slow_zero_page(void * page)
 363{
 364        int d0, d1;
 365        __asm__ __volatile__( \
 366                "cld\n\t" \
 367                "rep ; stosl" \
 368                : "=&c" (d0), "=&D" (d1)
 369                :"a" (0),"1" (page),"0" (1024)
 370                :"memory");
 371}
 372 
 373void mmx_clear_page(void * page)
 374{
 375        if(unlikely(in_interrupt()))
 376                slow_zero_page(page);
 377        else
 378                fast_clear_page(page);
 379}
 380
 381static void slow_copy_page(void *to, void *from)
 382{
 383        int d0, d1, d2;
 384        __asm__ __volatile__( \
 385                "cld\n\t" \
 386                "rep ; movsl" \
 387                : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
 388                : "0" (1024),"1" ((long) to),"2" ((long) from) \
 389                : "memory");
 390}
 391  
 392
 393void mmx_copy_page(void *to, void *from)
 394{
 395        if(unlikely(in_interrupt()))
 396                slow_copy_page(to, from);
 397        else
 398                fast_copy_page(to, from);
 399}
 400
 401EXPORT_SYMBOL(_mmx_memcpy);
 402EXPORT_SYMBOL(mmx_clear_page);
 403EXPORT_SYMBOL(mmx_copy_page);
 404
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.