linux/arch/x86/lib/mmx_32.c
<<
>>
Prefs
   1#include <linux/types.h>
   2#include <linux/string.h>
   3#include <linux/sched.h>
   4#include <linux/hardirq.h>
   5#include <linux/module.h>
   6
   7#include <asm/asm.h>
   8#include <asm/i387.h>
   9
  10
  11/*
  12 *      MMX 3DNow! library helper functions
  13 *
  14 *      To do:
  15 *      We can use MMX just for prefetch in IRQs. This may be a win.
  16 *              (reported so on K6-III)
  17 *      We should use a better code neutral filler for the short jump
  18 *              leal ebx, [ebx] is apparently best for K6-2, but Cyrix ??
  19 *      We also want to clobber the filler register so we don't get any
  20 *              register forwarding stalls on the filler. 
  21 *
  22 *      Add *user handling. Checksums are not a win with MMX on any CPU
  23 *      tested so far for any MMX solution figured.
  24 *
  25 *      22/09/2000 - Arjan van de Ven 
  26 *              Improved for non-engineering-sample Athlons
  27 *
  28 */
  29 
  30void *_mmx_memcpy(void *to, const void *from, size_t len)
  31{
  32        void *p;
  33        int i;
  34
  35        if (unlikely(in_interrupt()))
  36                return __memcpy(to, from, len);
  37
  38        p = to;
  39        i = len >> 6; /* len/64 */
  40
  41        kernel_fpu_begin();
  42
  43        __asm__ __volatile__ (
  44                "1: prefetch (%0)\n"            /* This set is 28 bytes */
  45                "   prefetch 64(%0)\n"
  46                "   prefetch 128(%0)\n"
  47                "   prefetch 192(%0)\n"
  48                "   prefetch 256(%0)\n"
  49                "2:  \n"
  50                ".section .fixup, \"ax\"\n"
  51                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
  52                "   jmp 2b\n"
  53                ".previous\n"
  54                _ASM_EXTABLE(1b,3b)
  55                : : "r" (from) );
  56                
  57        
  58        for(; i>5; i--)
  59        {
  60                __asm__ __volatile__ (
  61                "1:  prefetch 320(%0)\n"
  62                "2:  movq (%0), %%mm0\n"
  63                "  movq 8(%0), %%mm1\n"
  64                "  movq 16(%0), %%mm2\n"
  65                "  movq 24(%0), %%mm3\n"
  66                "  movq %%mm0, (%1)\n"
  67                "  movq %%mm1, 8(%1)\n"
  68                "  movq %%mm2, 16(%1)\n"
  69                "  movq %%mm3, 24(%1)\n"
  70                "  movq 32(%0), %%mm0\n"
  71                "  movq 40(%0), %%mm1\n"
  72                "  movq 48(%0), %%mm2\n"
  73                "  movq 56(%0), %%mm3\n"
  74                "  movq %%mm0, 32(%1)\n"
  75                "  movq %%mm1, 40(%1)\n"
  76                "  movq %%mm2, 48(%1)\n"
  77                "  movq %%mm3, 56(%1)\n"
  78                ".section .fixup, \"ax\"\n"
  79                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
  80                "   jmp 2b\n"
  81                ".previous\n"
  82                _ASM_EXTABLE(1b,3b)
  83                : : "r" (from), "r" (to) : "memory");
  84                from+=64;
  85                to+=64;
  86        }
  87
  88        for(; i>0; i--)
  89        {
  90                __asm__ __volatile__ (
  91                "  movq (%0), %%mm0\n"
  92                "  movq 8(%0), %%mm1\n"
  93                "  movq 16(%0), %%mm2\n"
  94                "  movq 24(%0), %%mm3\n"
  95                "  movq %%mm0, (%1)\n"
  96                "  movq %%mm1, 8(%1)\n"
  97                "  movq %%mm2, 16(%1)\n"
  98                "  movq %%mm3, 24(%1)\n"
  99                "  movq 32(%0), %%mm0\n"
 100                "  movq 40(%0), %%mm1\n"
 101                "  movq 48(%0), %%mm2\n"
 102                "  movq 56(%0), %%mm3\n"
 103                "  movq %%mm0, 32(%1)\n"
 104                "  movq %%mm1, 40(%1)\n"
 105                "  movq %%mm2, 48(%1)\n"
 106                "  movq %%mm3, 56(%1)\n"
 107                : : "r" (from), "r" (to) : "memory");
 108                from+=64;
 109                to+=64;
 110        }
 111        /*
 112         *      Now do the tail of the block
 113         */
 114        __memcpy(to, from, len&63);
 115        kernel_fpu_end();
 116        return p;
 117}
 118
 119#ifdef CONFIG_MK7
 120
 121/*
 122 *      The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 123 *      other MMX using processors do not.
 124 */
 125
 126static void fast_clear_page(void *page)
 127{
 128        int i;
 129
 130        kernel_fpu_begin();
 131        
 132        __asm__ __volatile__ (
 133                "  pxor %%mm0, %%mm0\n" : :
 134        );
 135
 136        for(i=0;i<4096/64;i++)
 137        {
 138                __asm__ __volatile__ (
 139                "  movntq %%mm0, (%0)\n"
 140                "  movntq %%mm0, 8(%0)\n"
 141                "  movntq %%mm0, 16(%0)\n"
 142                "  movntq %%mm0, 24(%0)\n"
 143                "  movntq %%mm0, 32(%0)\n"
 144                "  movntq %%mm0, 40(%0)\n"
 145                "  movntq %%mm0, 48(%0)\n"
 146                "  movntq %%mm0, 56(%0)\n"
 147                : : "r" (page) : "memory");
 148                page+=64;
 149        }
 150        /* since movntq is weakly-ordered, a "sfence" is needed to become
 151         * ordered again.
 152         */
 153        __asm__ __volatile__ (
 154                "  sfence \n" : :
 155        );
 156        kernel_fpu_end();
 157}
 158
 159static void fast_copy_page(void *to, void *from)
 160{
 161        int i;
 162
 163        kernel_fpu_begin();
 164
 165        /* maybe the prefetch stuff can go before the expensive fnsave...
 166         * but that is for later. -AV
 167         */
 168        __asm__ __volatile__ (
 169                "1: prefetch (%0)\n"
 170                "   prefetch 64(%0)\n"
 171                "   prefetch 128(%0)\n"
 172                "   prefetch 192(%0)\n"
 173                "   prefetch 256(%0)\n"
 174                "2:  \n"
 175                ".section .fixup, \"ax\"\n"
 176                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
 177                "   jmp 2b\n"
 178                ".previous\n"
 179                _ASM_EXTABLE(1b,3b)
 180                : : "r" (from) );
 181
 182        for(i=0; i<(4096-320)/64; i++)
 183        {
 184                __asm__ __volatile__ (
 185                "1: prefetch 320(%0)\n"
 186                "2: movq (%0), %%mm0\n"
 187                "   movntq %%mm0, (%1)\n"
 188                "   movq 8(%0), %%mm1\n"
 189                "   movntq %%mm1, 8(%1)\n"
 190                "   movq 16(%0), %%mm2\n"
 191                "   movntq %%mm2, 16(%1)\n"
 192                "   movq 24(%0), %%mm3\n"
 193                "   movntq %%mm3, 24(%1)\n"
 194                "   movq 32(%0), %%mm4\n"
 195                "   movntq %%mm4, 32(%1)\n"
 196                "   movq 40(%0), %%mm5\n"
 197                "   movntq %%mm5, 40(%1)\n"
 198                "   movq 48(%0), %%mm6\n"
 199                "   movntq %%mm6, 48(%1)\n"
 200                "   movq 56(%0), %%mm7\n"
 201                "   movntq %%mm7, 56(%1)\n"
 202                ".section .fixup, \"ax\"\n"
 203                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
 204                "   jmp 2b\n"
 205                ".previous\n"
 206                _ASM_EXTABLE(1b,3b)
 207                : : "r" (from), "r" (to) : "memory");
 208                from+=64;
 209                to+=64;
 210        }
 211        for(i=(4096-320)/64; i<4096/64; i++)
 212        {
 213                __asm__ __volatile__ (
 214                "2: movq (%0), %%mm0\n"
 215                "   movntq %%mm0, (%1)\n"
 216                "   movq 8(%0), %%mm1\n"
 217                "   movntq %%mm1, 8(%1)\n"
 218                "   movq 16(%0), %%mm2\n"
 219                "   movntq %%mm2, 16(%1)\n"
 220                "   movq 24(%0), %%mm3\n"
 221                "   movntq %%mm3, 24(%1)\n"
 222                "   movq 32(%0), %%mm4\n"
 223                "   movntq %%mm4, 32(%1)\n"
 224                "   movq 40(%0), %%mm5\n"
 225                "   movntq %%mm5, 40(%1)\n"
 226                "   movq 48(%0), %%mm6\n"
 227                "   movntq %%mm6, 48(%1)\n"
 228                "   movq 56(%0), %%mm7\n"
 229                "   movntq %%mm7, 56(%1)\n"
 230                : : "r" (from), "r" (to) : "memory");
 231                from+=64;
 232                to+=64;
 233        }
 234        /* since movntq is weakly-ordered, a "sfence" is needed to become
 235         * ordered again.
 236         */
 237        __asm__ __volatile__ (
 238                "  sfence \n" : :
 239        );
 240        kernel_fpu_end();
 241}
 242
 243#else
 244
 245/*
 246 *      Generic MMX implementation without K7 specific streaming
 247 */
 248 
 249static void fast_clear_page(void *page)
 250{
 251        int i;
 252        
 253        kernel_fpu_begin();
 254        
 255        __asm__ __volatile__ (
 256                "  pxor %%mm0, %%mm0\n" : :
 257        );
 258
 259        for(i=0;i<4096/128;i++)
 260        {
 261                __asm__ __volatile__ (
 262                "  movq %%mm0, (%0)\n"
 263                "  movq %%mm0, 8(%0)\n"
 264                "  movq %%mm0, 16(%0)\n"
 265                "  movq %%mm0, 24(%0)\n"
 266                "  movq %%mm0, 32(%0)\n"
 267                "  movq %%mm0, 40(%0)\n"
 268                "  movq %%mm0, 48(%0)\n"
 269                "  movq %%mm0, 56(%0)\n"
 270                "  movq %%mm0, 64(%0)\n"
 271                "  movq %%mm0, 72(%0)\n"
 272                "  movq %%mm0, 80(%0)\n"
 273                "  movq %%mm0, 88(%0)\n"
 274                "  movq %%mm0, 96(%0)\n"
 275                "  movq %%mm0, 104(%0)\n"
 276                "  movq %%mm0, 112(%0)\n"
 277                "  movq %%mm0, 120(%0)\n"
 278                : : "r" (page) : "memory");
 279                page+=128;
 280        }
 281
 282        kernel_fpu_end();
 283}
 284
 285static void fast_copy_page(void *to, void *from)
 286{
 287        int i;
 288        
 289        
 290        kernel_fpu_begin();
 291
 292        __asm__ __volatile__ (
 293                "1: prefetch (%0)\n"
 294                "   prefetch 64(%0)\n"
 295                "   prefetch 128(%0)\n"
 296                "   prefetch 192(%0)\n"
 297                "   prefetch 256(%0)\n"
 298                "2:  \n"
 299                ".section .fixup, \"ax\"\n"
 300                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
 301                "   jmp 2b\n"
 302                ".previous\n"
 303                _ASM_EXTABLE(1b,3b)
 304                : : "r" (from) );
 305
 306        for(i=0; i<4096/64; i++)
 307        {
 308                __asm__ __volatile__ (
 309                "1: prefetch 320(%0)\n"
 310                "2: movq (%0), %%mm0\n"
 311                "   movq 8(%0), %%mm1\n"
 312                "   movq 16(%0), %%mm2\n"
 313                "   movq 24(%0), %%mm3\n"
 314                "   movq %%mm0, (%1)\n"
 315                "   movq %%mm1, 8(%1)\n"
 316                "   movq %%mm2, 16(%1)\n"
 317                "   movq %%mm3, 24(%1)\n"
 318                "   movq 32(%0), %%mm0\n"
 319                "   movq 40(%0), %%mm1\n"
 320                "   movq 48(%0), %%mm2\n"
 321                "   movq 56(%0), %%mm3\n"
 322                "   movq %%mm0, 32(%1)\n"
 323                "   movq %%mm1, 40(%1)\n"
 324                "   movq %%mm2, 48(%1)\n"
 325                "   movq %%mm3, 56(%1)\n"
 326                ".section .fixup, \"ax\"\n"
 327                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
 328                "   jmp 2b\n"
 329                ".previous\n"
 330                _ASM_EXTABLE(1b,3b)
 331                : : "r" (from), "r" (to) : "memory");
 332                from+=64;
 333                to+=64;
 334        }
 335        kernel_fpu_end();
 336}
 337
 338
 339#endif
 340
 341/*
 342 *      Favour MMX for page clear and copy. 
 343 */
 344
 345static void slow_zero_page(void * page)
 346{
 347        int d0, d1;
 348        __asm__ __volatile__( \
 349                "cld\n\t" \
 350                "rep ; stosl" \
 351                : "=&c" (d0), "=&D" (d1)
 352                :"a" (0),"1" (page),"0" (1024)
 353                :"memory");
 354}
 355 
 356void mmx_clear_page(void * page)
 357{
 358        if(unlikely(in_interrupt()))
 359                slow_zero_page(page);
 360        else
 361                fast_clear_page(page);
 362}
 363
 364static void slow_copy_page(void *to, void *from)
 365{
 366        int d0, d1, d2;
 367        __asm__ __volatile__( \
 368                "cld\n\t" \
 369                "rep ; movsl" \
 370                : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
 371                : "0" (1024),"1" ((long) to),"2" ((long) from) \
 372                : "memory");
 373}
 374  
 375
 376void mmx_copy_page(void *to, void *from)
 377{
 378        if(unlikely(in_interrupt()))
 379                slow_copy_page(to, from);
 380        else
 381                fast_copy_page(to, from);
 382}
 383
 384EXPORT_SYMBOL(_mmx_memcpy);
 385EXPORT_SYMBOL(mmx_clear_page);
 386EXPORT_SYMBOL(mmx_copy_page);
 387
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.