linux/arch/x86/include/asm/xor_32.h
#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

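/*
 * Helper macros that each expand to one MMX instruction string: LD loads
 * the x-th quadword of the destination buffer (%1) into %%mm<y>, ST writes
 * it back, and XO1..XO4 XOR in the corresponding quadword of the source
 * operands %2..%5.
 */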
#define LD(x, y)        "       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x, y)        "       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x, y)       "       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x, y)       "       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x, y)       "       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x, y)       "       pxor   8*("#x")(%5), %%mm"#y"   ;\n"

#include <asm/i387.h>

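/*
 * The pII_mmx variants below process 128 bytes per loop iteration.  Each
 * BLOCK() handles four quadwords in %%mm0-%%mm3, with the loads, XORs and
 * stores interleaved, presumably so that independent operations can
 * overlap in the pipeline.
 */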
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
        ST(i, 0)                                \
                XO1(i+1, 1)                     \
                ST(i+1, 1)                      \
                        XO1(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO1(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
        ST(i, 0)                                \
                XO2(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO2(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO2(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
                XO2(i + 1, 1)                   \
                        XO2(i + 2, 2)           \
                                XO2(i + 3, 3)   \
        XO3(i, 0)                               \
        ST(i, 0)                                \
                XO3(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO3(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO3(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}


static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
                XO2(i + 1, 1)                   \
                        XO2(i + 2, 2)           \
                                XO2(i + 3, 3)   \
        XO3(i, 0)                               \
                XO3(i + 1, 1)                   \
                        XO3(i + 2, 2)           \
                                XO3(i + 3, 3)   \
        XO4(i, 0)                               \
        ST(i, 0)                                \
                XO4(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO4(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO4(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       addl $128, %5         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

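/*
 * The p5_mmx variants below use all eight MMX registers on a straight-line
 * 64-byte loop body; the name suggests the instruction ordering was tuned
 * for the original Pentium (P5) pipeline.
 */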
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32                  ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory" );

        kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor   (%5), %%mm0   ;\n"
        "       pxor  8(%5), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%5), %%mm2   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%5), %%mm3   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 32(%5), %%mm4   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       pxor 40(%5), %%mm5   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%5), %%mm6   ;\n"
        "       pxor 56(%5), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       addl $64, %5         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

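/*
 * Operation tables handed to the generic XOR benchmarking code through
 * XOR_TRY_TEMPLATES at the bottom of this file.
 */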
static struct xor_block_template xor_block_pII_mmx = {
        .name = "pII_mmx",
        .do_2 = xor_pII_mmx_2,
        .do_3 = xor_pII_mmx_3,
        .do_4 = xor_pII_mmx_4,
        .do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
        .name = "p5_mmx",
        .do_2 = xor_p5_mmx_2,
        .do_3 = xor_p5_mmx_3,
        .do_4 = xor_p5_mmx_4,
        .do_5 = xor_p5_mmx_5,
};

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

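/*
 * The SSE routines manage the FPU state by hand rather than calling
 * kernel_fpu_begin(): XMMS_SAVE disables preemption, clears CR0.TS and
 * spills %xmm0-%xmm3 to an aligned buffer on the caller's stack;
 * XMMS_RESTORE issues an sfence, reloads the registers, restores CR0 and
 * re-enables preemption.
 */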
#define XMMS_SAVE                               \
do {                                            \
        preempt_disable();                      \
        cr0 = read_cr0();                       \
        clts();                                 \
        asm volatile(                           \
                "movups %%xmm0,(%0)     ;\n\t"  \
                "movups %%xmm1,0x10(%0) ;\n\t"  \
                "movups %%xmm2,0x20(%0) ;\n\t"  \
                "movups %%xmm3,0x30(%0) ;\n\t"  \
                :                               \
                : "r" (xmm_save)                \
                : "memory");                    \
} while (0)

#define XMMS_RESTORE                            \
do {                                            \
        asm volatile(                           \
                "sfence                 ;\n\t"  \
                "movups (%0),%%xmm0     ;\n\t"  \
                "movups 0x10(%0),%%xmm1 ;\n\t"  \
                "movups 0x20(%0),%%xmm2 ;\n\t"  \
                "movups 0x30(%0),%%xmm3 ;\n\t"  \
                :                               \
                : "r" (xmm_save)                \
                : "memory");                    \
        write_cr0(cr0);                         \
        preempt_enable();                       \
} while (0)

#define ALIGN16 __attribute__((aligned(16)))

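/*
 * OFFS() addresses the x-th 16-byte SSE word of the chunk currently being
 * processed; PF_OFFS() points 256 bytes (one loop iteration) ahead, so the
 * PF0()-PF5() prefetchnta hints pull in the next iteration's data with
 * minimal cache pollution.
 */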
#define OFFS(x)         "16*("#x")"
#define PF_OFFS(x)      "256+16*("#x")"
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%1)            ;\n"
#define LD(x, y)        "       movaps   "OFFS(x)"(%1), %%xmm"#y"       ;\n"
#define ST(x, y)        "       movaps %%xmm"#y",   "OFFS(x)"(%1)       ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%2)            ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%3)            ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%4)            ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%5)            ;\n"
#define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%6)            ;\n"
#define XO1(x, y)       "       xorps   "OFFS(x)"(%2), %%xmm"#y"        ;\n"
#define XO2(x, y)       "       xorps   "OFFS(x)"(%3), %%xmm"#y"        ;\n"
#define XO3(x, y)       "       xorps   "OFFS(x)"(%4), %%xmm"#y"        ;\n"
#define XO4(x, y)       "       xorps   "OFFS(x)"(%5), %%xmm"#y"        ;\n"
#define XO5(x, y)       "       xorps   "OFFS(x)"(%6), %%xmm"#y"        ;\n"

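/*
 * The SSE variants below process 256 bytes per loop iteration, four
 * 16-byte words at a time in %xmm0-%xmm3, with prefetches interleaved
 * ahead of the loads and XORs.
 */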
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                        \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                PF1(i)                                  \
                                PF1(i + 2)              \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        XMMS_RESTORE;
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i,0)                                 \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i,0)                                \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                XO2(i,0)                                \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                ST(i,0)                                 \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r"(p2), "+r"(p3)
        :
        : "memory" );

        XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i,0)                                 \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i,0)                                \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO2(i,0)                                \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                XO3(i,0)                                \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                ST(i,0)                                 \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       addl $256, %4           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory" );

        XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i,0)                                 \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i,0)                                \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                XO2(i,0)                                \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                PF4(i)                                  \
                                PF4(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO3(i,0)                                \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                XO4(i,0)                                \
                        XO4(i + 1, 1)                   \
                                XO4(i + 2, 2)           \
                                        XO4(i + 3, 3)   \
                ST(i,0)                                 \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       addl $256, %4           ;\n"
        "       addl $256, %5           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        XMMS_RESTORE;
}

static struct xor_block_template xor_block_pIII_sse = {
        .name = "pIII_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};

/* Also try the generic routines.  */
#include <asm-generic/xor.h>

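/*
 * XOR_TRY_TEMPLATES lets the generic calibration code benchmark each
 * candidate with xor_speed(): the integer routines from
 * <asm-generic/xor.h> are always tried, the MMX/SSE ones only when the
 * CPU advertises the corresponding feature bits.
 */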
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                               \
do {                                                    \
        xor_speed(&xor_block_8regs);                    \
        xor_speed(&xor_block_8regs_p);                  \
        xor_speed(&xor_block_32regs);                   \
        xor_speed(&xor_block_32regs_p);                 \
        if (cpu_has_xmm)                                \
                xor_speed(&xor_block_pIII_sse);         \
        if (cpu_has_mmx) {                              \
                xor_speed(&xor_block_pII_mmx);          \
                xor_speed(&xor_block_p5_mmx);           \
        }                                               \
} while (0)

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST)                    \
        (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)

#endif /* _ASM_X86_XOR_32_H */