linux/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
   1#!/usr/bin/env perl
   2# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   3#
   4# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
   5# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
   6# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
   7#
   8# This code is taken from the OpenSSL project but the author, Andy Polyakov,
   9# has relicensed it under the licenses specified in the SPDX header above.
  10# The original headers, including the original license headers, are
  11# included below for completeness.
  12#
  13# ====================================================================
  14# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  15# project. The module is, however, dual licensed under OpenSSL and
  16# CRYPTOGAMS licenses depending on where you obtain it. For further
  17# details see http://www.openssl.org/~appro/cryptogams/.
  18# ====================================================================
  19#
  20# This module implements Poly1305 hash for x86_64.
  21#
  22# March 2015
  23#
  24# Initial release.
  25#
  26# December 2016
  27#
  28# Add AVX512F+VL+BW code path.
  29#
  30# November 2017
  31#
  32# Convert the AVX512F+VL+BW code path to pure AVX512F, so that it can be
  33# executed even on Knights Landing. The trigger for this modification was
  34# the observation that AVX512 code paths can negatively affect overall
  35# Skylake-X system performance. Since we are likely to suppress the
  36# AVX512F capability flag [at least on Skylake-X], the conversion serves
  37# as a kind of "investment protection". Note that the next *lake processor,
  38# Cannonlake, has an AVX512IFMA code path to execute...
  39#
  40# Numbers are cycles per processed byte with poly1305_blocks alone,
  41# measured with rdtsc at fixed clock frequency.
  42#
  43#               IALU/gcc-4.8(*) AVX(**)         AVX2    AVX-512
  44# P4            4.46/+120%      -
  45# Core 2        2.41/+90%       -
  46# Westmere      1.88/+120%      -
  47# Sandy Bridge  1.39/+140%      1.10
  48# Haswell       1.14/+175%      1.11            0.65
  49# Skylake[-X]   1.13/+120%      0.96            0.51    [0.35]
  50# Silvermont    2.83/+95%       -
  51# Knights L     3.60/?          1.65            1.10    0.41(***)
  52# Goldmont      1.70/+180%      -
  53# VIA Nano      1.82/+150%      -
  54# Sledgehammer  1.38/+160%      -
  55# Bulldozer     2.30/+130%      0.97
  56# Ryzen         1.15/+200%      1.08            1.18
  57#
  58# (*)   improvement coefficients relative to clang are more modest,
  59#       ~50% on most processors; in both cases we are comparing to
  60#       __int128 code;
  61# (**)  an SSE2 implementation was attempted, but among non-AVX processors
  62#       it was faster than the integer-only code only on older Intel P4 and
  63#       Core processors, by 30-50% (less so the newer the processor), while
  64#       being slower on contemporary ones, e.g. almost 2x slower on Atom;
  65#       as the former are naturally disappearing, SSE2 is deemed unnecessary;
  66# (***) strangely enough, performance seems to vary from core to core;
  67#       the listed result is the best case;
  68
  69$flavour = shift;
  70$output  = shift;
  71if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  72
  73$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  74$kernel=0; $kernel=1 if (!$flavour && !$output);
  75
  76if (!$kernel) {
  77        $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  78        ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  79        ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  80        die "can't locate x86_64-xlate.pl";
  81
  82        open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  83        *STDOUT=*OUT;
  84
  85        if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  86            =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  87                $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
  88        }
  89
  90        if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  91            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
  92                $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
  93                $avx += 1 if ($1==2.11 && $2>=8);
  94        }
  95
  96        if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  97            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  98                $avx = ($1>=10) + ($1>=11);
  99        }
 100
 101        if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
 102                $avx = ($2>=3.0) + ($2>3.0);
 103        }
 104} else {
 105        $avx = 4; # The kernel uses ifdefs for this.
 106}
 107
 108sub declare_function() {
 109        my ($name, $align, $nargs) = @_;
 110        if($kernel) {
 111                $code .= ".align $align\n";
 112                $code .= "SYM_FUNC_START($name)\n";
 113                $code .= ".L$name:\n";
 114        } else {
 115                $code .= ".globl        $name\n";
 116                $code .= ".type $name,\@function,$nargs\n";
 117                $code .= ".align        $align\n";
 118                $code .= "$name:\n";
 119        }
 120}
 121
 122sub end_function() {
 123        my ($name) = @_;
 124        if($kernel) {
 125                $code .= "SYM_FUNC_END($name)\n";
 126        } else {
 127                $code .= ".size   $name,.-$name\n";
 128        }
 129}
 130
 131$code.=<<___ if $kernel;
 132#include <linux/linkage.h>
 133___
 134
 135if ($avx) {
 136$code.=<<___ if $kernel;
 137.section .rodata
 138___
 139$code.=<<___;
 140.align  64
 141.Lconst:
 142.Lmask24:
 143.long   0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
 144.L129:
 145.long   `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
 146.Lmask26:
 147.long   0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
 148.Lpermd_avx2:
 149.long   2,2,2,3,2,0,2,1
 150.Lpermd_avx512:
 151.long   0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
 152
 153.L2_44_inp_permd:
 154.long   0,1,1,2,2,3,7,7
 155.L2_44_inp_shift:
 156.quad   0,12,24,64
 157.L2_44_mask:
 158.quad   0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
 159.L2_44_shift_rgt:
 160.quad   44,44,42,64
 161.L2_44_shift_lft:
 162.quad   8,8,10,64
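# (the .L2_44_* tables above serve the base 2^44 code path, cf. the
#  .Linit_base2_44 branch in poly1305_init, where the 130-bit accumulator is
#  held in 44+44+42-bit limbs; the permd/shift pairs repack 16-byte blocks
#  into that form and the masks and shift counts drive its carry chain.)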
 163
 164.align  64
 165.Lx_mask44:
 166.quad   0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
 167.quad   0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
 168.Lx_mask42:
 169.quad   0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 170.quad   0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 171___
 172}
 173$code.=<<___ if (!$kernel);
 174.asciz  "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 175.align  16
 176___
 177
 178my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
 179my ($mac,$nonce)=($inp,$len);   # *_emit arguments
 180my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
 181my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
 182
 183sub poly1305_iteration {
 184# input:        copy of $r1 in %rax, $h0-$h2, $r0-$r1
 185# output:       $h0-$h2 *= $r0-$r1
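# As an illustrative summary of the math behind the generated instructions
# (comments only, nothing here is emitted): with h = h0 + 2^64*h1 + 2^128*h2
# and r = r0 + 2^64*r1, the four mulq and two imulq below gather the partial
# products per 64-bit column, with the cross terms of weight >= 2^128
# pre-folded through s1 = 5*r1/4:
#       2^0   column: lo(h0*r0) + lo(h1*s1)
#       2^64  column: hi(h0*r0) + hi(h1*s1) + lo(h0*r1) + lo(h1*r0) + h2*s1
#       2^128 column: hi(h0*r1) + hi(h1*r0) + h2*r0    (accumulated in d3)
# The tail then reduces the 2^128 column: h2 keeps d3 & 3, and
# 5*(d3 >> 2) = (d3 & ~3) + (d3 >> 2) is added back into the 2^0 column,
# since 2^130 == 5 (mod 2^130-5).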
 186$code.=<<___;
 187        mulq    $h0                     # h0*r1
 188        mov     %rax,$d2
 189         mov    $r0,%rax
 190        mov     %rdx,$d3
 191
 192        mulq    $h0                     # h0*r0
 193        mov     %rax,$h0                # future $h0
 194         mov    $r0,%rax
 195        mov     %rdx,$d1
 196
 197        mulq    $h1                     # h1*r0
 198        add     %rax,$d2
 199         mov    $s1,%rax
 200        adc     %rdx,$d3
 201
 202        mulq    $h1                     # h1*s1
 203         mov    $h2,$h1                 # borrow $h1
 204        add     %rax,$h0
 205        adc     %rdx,$d1
 206
 207        imulq   $s1,$h1                 # h2*s1
 208        add     $h1,$d2
 209         mov    $d1,$h1
 210        adc     \$0,$d3
 211
 212        imulq   $r0,$h2                 # h2*r0
 213        add     $d2,$h1
 214        mov     \$-4,%rax               # mask value
 215        adc     $h2,$d3
 216
 217        and     $d3,%rax                # last reduction step
 218        mov     $d3,$h2
 219        shr     \$2,$d3
 220        and     \$3,$h2
 221        add     $d3,%rax
 222        add     %rax,$h0
 223        adc     \$0,$h1
 224        adc     \$0,$h2
 225___
 226}
 227
 228########################################################################
 229# The layout of the opaque area is as follows.
 230#
 231#       unsigned __int64 h[3];          # current hash value base 2^64
 232#       unsigned __int64 r[2];          # key value base 2^64
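#
# For orientation (an offset map inferred from the loads and stores below,
# not a formal definition): h lives at 0/8/16($ctx) and the clamped r at
# 24/32($ctx); 20($ctx), the upper half of h[2], doubles as the is_base2_26
# flag consulted by the vector code paths further down.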
 233
 234$code.=<<___;
 235.text
 236___
 237$code.=<<___ if (!$kernel);
 238.extern OPENSSL_ia32cap_P
 239
 240.globl  poly1305_init_x86_64
 241.hidden poly1305_init_x86_64
 242.globl  poly1305_blocks_x86_64
 243.hidden poly1305_blocks_x86_64
 244.globl  poly1305_emit_x86_64
 245.hidden poly1305_emit_x86_64
 246___
 247&declare_function("poly1305_init_x86_64", 32, 3);
 248$code.=<<___;
 249        xor     %eax,%eax
 250        mov     %rax,0($ctx)            # initialize hash value
 251        mov     %rax,8($ctx)
 252        mov     %rax,16($ctx)
 253
 254        test    $inp,$inp
 255        je      .Lno_key
 256___
 257$code.=<<___ if (!$kernel);
 258        lea     poly1305_blocks_x86_64(%rip),%r10
 259        lea     poly1305_emit_x86_64(%rip),%r11
 260___
 261$code.=<<___    if (!$kernel && $avx);
 262        mov     OPENSSL_ia32cap_P+4(%rip),%r9
 263        lea     poly1305_blocks_avx(%rip),%rax
 264        lea     poly1305_emit_avx(%rip),%rcx
 265        bt      \$`60-32`,%r9           # AVX?
 266        cmovc   %rax,%r10
 267        cmovc   %rcx,%r11
 268___
 269$code.=<<___    if (!$kernel && $avx>1);
 270        lea     poly1305_blocks_avx2(%rip),%rax
 271        bt      \$`5+32`,%r9            # AVX2?
 272        cmovc   %rax,%r10
 273___
 274$code.=<<___    if (!$kernel && $avx>3);
 275        mov     \$`(1<<31|1<<21|1<<16)`,%rax
 276        shr     \$32,%r9
 277        and     %rax,%r9
 278        cmp     %rax,%r9
 279        je      .Linit_base2_44
 280___
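# (The feature tests above follow the usual OPENSSL_ia32cap_P layout: %r9 is
# loaded with 64 bits starting at word 1, so bit 60-32 = 28 is CPUID.1:ECX.AVX
# and bit 5+32 is CPUID.7:EBX.AVX2, while after the shr the mask checks the
# AVX512F, AVX512IFMA and AVX512VL bits of CPUID.7:EBX before taking the
# base 2^44 init path.)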
 281$code.=<<___;
 282        mov     \$0x0ffffffc0fffffff,%rax
 283        mov     \$0x0ffffffc0ffffffc,%rcx
 284        and     0($inp),%rax
 285        and     8($inp),%rcx
 286        mov     %rax,24($ctx)
 287        mov     %rcx,32($ctx)
 288___
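# (The two masks above are the standard Poly1305 "clamp" of r: the top four
# bits of bytes 3, 7, 11, 15 and the bottom two bits of bytes 4, 8, 12 of the
# little-endian key are cleared.  In particular r1 becomes a multiple of 4,
# which is what lets s1 = r1 + (r1 >> 2) stand in for 5*r1/4 in the block
# routines.)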
 289$code.=<<___    if (!$kernel && $flavour !~ /elf32/);
 290        mov     %r10,0(%rdx)
 291        mov     %r11,8(%rdx)
 292___
 293$code.=<<___    if (!$kernel && $flavour =~ /elf32/);
 294        mov     %r10d,0(%rdx)
 295        mov     %r11d,4(%rdx)
 296___
 297$code.=<<___;
 298        mov     \$1,%eax
 299.Lno_key:
 300        ret
 301___
 302&end_function("poly1305_init_x86_64");
 303
 304&declare_function("poly1305_blocks_x86_64", 32, 4);
 305$code.=<<___;
 306.cfi_startproc
 307.Lblocks:
 308        shr     \$4,$len
 309        jz      .Lno_data               # too short
 310
 311        push    %rbx
 312.cfi_push       %rbx
 313        push    %r12
 314.cfi_push       %r12
 315        push    %r13
 316.cfi_push       %r13
 317        push    %r14
 318.cfi_push       %r14
 319        push    %r15
 320.cfi_push       %r15
 321        push    $ctx
 322.cfi_push       $ctx
 323.Lblocks_body:
 324
 325        mov     $len,%r15               # reassign $len
 326
 327        mov     24($ctx),$r0            # load r
 328        mov     32($ctx),$s1
 329
 330        mov     0($ctx),$h0             # load hash value
 331        mov     8($ctx),$h1
 332        mov     16($ctx),$h2
 333
 334        mov     $s1,$r1
 335        shr     \$2,$s1
 336        mov     $r1,%rax
 337        add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
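	# (s1 is exact: the clamp guarantees r1 is a multiple of 4, so
	#  r1 + (r1 >> 2) = 5*r1/4.  Multiplying the upper-limb cross terms
	#  by s1 instead of r1 folds the 2^130 == 5 wraparound straight into
	#  the lower columns inside poly1305_iteration.)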
 338        jmp     .Loop
 339
 340.align  32
 341.Loop:
 342        add     0($inp),$h0             # accumulate input
 343        adc     8($inp),$h1
 344        lea     16($inp),$inp
 345        adc     $padbit,$h2
 346___
 347
 348        &poly1305_iteration();
 349
 350$code.=<<___;
 351        mov     $r1,%rax
 352        dec     %r15                    # len-=16
 353        jnz     .Loop
 354
 355        mov     0(%rsp),$ctx
 356.cfi_restore    $ctx
 357
 358        mov     $h0,0($ctx)             # store hash value
 359        mov     $h1,8($ctx)
 360        mov     $h2,16($ctx)
 361
 362        mov     8(%rsp),%r15
 363.cfi_restore    %r15
 364        mov     16(%rsp),%r14
 365.cfi_restore    %r14
 366        mov     24(%rsp),%r13
 367.cfi_restore    %r13
 368        mov     32(%rsp),%r12
 369.cfi_restore    %r12
 370        mov     40(%rsp),%rbx
 371.cfi_restore    %rbx
 372        lea     48(%rsp),%rsp
 373.cfi_adjust_cfa_offset  -48
 374.Lno_data:
 375.Lblocks_epilogue:
 376        ret
 377.cfi_endproc
 378___
 379&end_function("poly1305_blocks_x86_64");
 380
 381&declare_function("poly1305_emit_x86_64", 32, 3);
 382$code.=<<___;
 383.Lemit:
 384        mov     0($ctx),%r8     # load hash value
 385        mov     8($ctx),%r9
 386        mov     16($ctx),%r10
 387
 388        mov     %r8,%rax
 389        add     \$5,%r8         # compare to modulus
 390        mov     %r9,%rcx
 391        adc     \$0,%r9
 392        adc     \$0,%r10
 393        shr     \$2,%r10        # did 130-bit value overflow?
 394        cmovnz  %r8,%rax
 395        cmovnz  %r9,%rcx
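	# (h enters only partially reduced, so h mod 2^130-5 is either h
	#  itself or h + 5 - 2^130.  Adding 5 and testing whether the sum
	#  reached 2^130 -- the shr above -- picks between them; the cmovs
	#  then keep the low 128 bits of the right candidate, which is all
	#  the tag needs.)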
 396
 397        add     0($nonce),%rax  # accumulate nonce
 398        adc     8($nonce),%rcx
 399        mov     %rax,0($mac)    # write result
 400        mov     %rcx,8($mac)
 401
 402        ret
 403___
 404&end_function("poly1305_emit_x86_64");
 405if ($avx) {
 406
 407########################################################################
 408# The layout of the opaque area is as follows.
 409#
 410#       unsigned __int32 h[5];          # current hash value base 2^26
 411#       unsigned __int32 is_base2_26;
 412#       unsigned __int64 r[2];          # key value base 2^64
 413#       unsigned __int64 pad;
 414#       struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
 415#
 416# where r^n are the base 2^26 digits of the powers of the multiplier key.
 417# There are 5 digits, but the last four are interleaved with their multiples
 418# of 5, for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
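#
# Keeping 5*r1..5*r4 next to r1..r4 lets the vector code fold the modulus on
# the fly: in the d0..d4 column sums used below, every product h_i*r_j whose
# weight would reach 2^130 appears instead as h_i*(5*r_j) in column i+j-5,
# since 2^130 == 5 (mod 2^130-5).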
 419
 420my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
 421    map("%xmm$_",(0..15));
 422
 423$code.=<<___;
 424.type   __poly1305_block,\@abi-omnipotent
 425.align  32
 426__poly1305_block:
 427        push $ctx
 428___
 429        &poly1305_iteration();
 430$code.=<<___;
 431        pop $ctx
 432        ret
 433.size   __poly1305_block,.-__poly1305_block
 434
 435.type   __poly1305_init_avx,\@abi-omnipotent
 436.align  32
 437__poly1305_init_avx:
 438        push %rbp
 439        mov %rsp,%rbp
 440        mov     $r0,$h0
 441        mov     $r1,$h1
 442        xor     $h2,$h2
 443
 444        lea     48+64($ctx),$ctx        # size optimization
 445
 446        mov     $r1,%rax
 447        call    __poly1305_block        # r^2
 448
 449        mov     \$0x3ffffff,%eax        # save interleaved r^2 and r base 2^26
 450        mov     \$0x3ffffff,%edx
 451        mov     $h0,$d1
 452        and     $h0#d,%eax
 453        mov     $r0,$d2
 454        and     $r0#d,%edx
 455        mov     %eax,`16*0+0-64`($ctx)
 456        shr     \$26,$d1
 457        mov     %edx,`16*0+4-64`($ctx)
 458        shr     \$26,$d2
 459
 460        mov     \$0x3ffffff,%eax
 461        mov     \$0x3ffffff,%edx
 462        and     $d1#d,%eax
 463        and     $d2#d,%edx
 464        mov     %eax,`16*1+0-64`($ctx)
 465        lea     (%rax,%rax,4),%eax      # *5
 466        mov     %edx,`16*1+4-64`($ctx)
 467        lea     (%rdx,%rdx,4),%edx      # *5
 468        mov     %eax,`16*2+0-64`($ctx)
 469        shr     \$26,$d1
 470        mov     %edx,`16*2+4-64`($ctx)
 471        shr     \$26,$d2
 472
 473        mov     $h1,%rax
 474        mov     $r1,%rdx
 475        shl     \$12,%rax
 476        shl     \$12,%rdx
 477        or      $d1,%rax
 478        or      $d2,%rdx
 479        and     \$0x3ffffff,%eax
 480        and     \$0x3ffffff,%edx
 481        mov     %eax,`16*3+0-64`($ctx)
 482        lea     (%rax,%rax,4),%eax      # *5
 483        mov     %edx,`16*3+4-64`($ctx)
 484        lea     (%rdx,%rdx,4),%edx      # *5
 485        mov     %eax,`16*4+0-64`($ctx)
 486        mov     $h1,$d1
 487        mov     %edx,`16*4+4-64`($ctx)
 488        mov     $r1,$d2
 489
 490        mov     \$0x3ffffff,%eax
 491        mov     \$0x3ffffff,%edx
 492        shr     \$14,$d1
 493        shr     \$14,$d2
 494        and     $d1#d,%eax
 495        and     $d2#d,%edx
 496        mov     %eax,`16*5+0-64`($ctx)
 497        lea     (%rax,%rax,4),%eax      # *5
 498        mov     %edx,`16*5+4-64`($ctx)
 499        lea     (%rdx,%rdx,4),%edx      # *5
 500        mov     %eax,`16*6+0-64`($ctx)
 501        shr     \$26,$d1
 502        mov     %edx,`16*6+4-64`($ctx)
 503        shr     \$26,$d2
 504
 505        mov     $h2,%rax
 506        shl     \$24,%rax
 507        or      %rax,$d1
 508        mov     $d1#d,`16*7+0-64`($ctx)
 509        lea     ($d1,$d1,4),$d1         # *5
 510        mov     $d2#d,`16*7+4-64`($ctx)
 511        lea     ($d2,$d2,4),$d2         # *5
 512        mov     $d1#d,`16*8+0-64`($ctx)
 513        mov     $d2#d,`16*8+4-64`($ctx)
 514
 515        mov     $r1,%rax
 516        call    __poly1305_block        # r^3
 517
 518        mov     \$0x3ffffff,%eax        # save r^3 base 2^26
 519        mov     $h0,$d1
 520        and     $h0#d,%eax
 521        shr     \$26,$d1
 522        mov     %eax,`16*0+12-64`($ctx)
 523
 524        mov     \$0x3ffffff,%edx
 525        and     $d1#d,%edx
 526        mov     %edx,`16*1+12-64`($ctx)
 527        lea     (%rdx,%rdx,4),%edx      # *5
 528        shr     \$26,$d1
 529        mov     %edx,`16*2+12-64`($ctx)
 530
 531        mov     $h1,%rax
 532        shl     \$12,%rax
 533        or      $d1,%rax
 534        and     \$0x3ffffff,%eax
 535        mov     %eax,`16*3+12-64`($ctx)
 536        lea     (%rax,%rax,4),%eax      # *5
 537        mov     $h1,$d1
 538        mov     %eax,`16*4+12-64`($ctx)
 539
 540        mov     \$0x3ffffff,%edx
 541        shr     \$14,$d1
 542        and     $d1#d,%edx
 543        mov     %edx,`16*5+12-64`($ctx)
 544        lea     (%rdx,%rdx,4),%edx      # *5
 545        shr     \$26,$d1
 546        mov     %edx,`16*6+12-64`($ctx)
 547
 548        mov     $h2,%rax
 549        shl     \$24,%rax
 550        or      %rax,$d1
 551        mov     $d1#d,`16*7+12-64`($ctx)
 552        lea     ($d1,$d1,4),$d1         # *5
 553        mov     $d1#d,`16*8+12-64`($ctx)
 554
 555        mov     $r1,%rax
 556        call    __poly1305_block        # r^4
 557
 558        mov     \$0x3ffffff,%eax        # save r^4 base 2^26
 559        mov     $h0,$d1
 560        and     $h0#d,%eax
 561        shr     \$26,$d1
 562        mov     %eax,`16*0+8-64`($ctx)
 563
 564        mov     \$0x3ffffff,%edx
 565        and     $d1#d,%edx
 566        mov     %edx,`16*1+8-64`($ctx)
 567        lea     (%rdx,%rdx,4),%edx      # *5
 568        shr     \$26,$d1
 569        mov     %edx,`16*2+8-64`($ctx)
 570
 571        mov     $h1,%rax
 572        shl     \$12,%rax
 573        or      $d1,%rax
 574        and     \$0x3ffffff,%eax
 575        mov     %eax,`16*3+8-64`($ctx)
 576        lea     (%rax,%rax,4),%eax      # *5
 577        mov     $h1,$d1
 578        mov     %eax,`16*4+8-64`($ctx)
 579
 580        mov     \$0x3ffffff,%edx
 581        shr     \$14,$d1
 582        and     $d1#d,%edx
 583        mov     %edx,`16*5+8-64`($ctx)
 584        lea     (%rdx,%rdx,4),%edx      # *5
 585        shr     \$26,$d1
 586        mov     %edx,`16*6+8-64`($ctx)
 587
 588        mov     $h2,%rax
 589        shl     \$24,%rax
 590        or      %rax,$d1
 591        mov     $d1#d,`16*7+8-64`($ctx)
 592        lea     ($d1,$d1,4),$d1         # *5
 593        mov     $d1#d,`16*8+8-64`($ctx)
 594
 595        lea     -48-64($ctx),$ctx       # size [de-]optimization
 596        pop %rbp
 597        ret
 598.size   __poly1305_init_avx,.-__poly1305_init_avx
 599___
 600
 601&declare_function("poly1305_blocks_avx", 32, 4);
 602$code.=<<___;
 603.cfi_startproc
 604        mov     20($ctx),%r8d           # is_base2_26
 605        cmp     \$128,$len
 606        jae     .Lblocks_avx
 607        test    %r8d,%r8d
 608        jz      .Lblocks
 609
 610.Lblocks_avx:
 611        and     \$-16,$len
 612        jz      .Lno_data_avx
 613
 614        vzeroupper
 615
 616        test    %r8d,%r8d
 617        jz      .Lbase2_64_avx
 618
 619        test    \$31,$len
 620        jz      .Leven_avx
 621
 622        push    %rbp
 623.cfi_push       %rbp
 624        mov     %rsp,%rbp
 625        push    %rbx
 626.cfi_push       %rbx
 627        push    %r12
 628.cfi_push       %r12
 629        push    %r13
 630.cfi_push       %r13
 631        push    %r14
 632.cfi_push       %r14
 633        push    %r15
 634.cfi_push       %r15
 635.Lblocks_avx_body:
 636
 637        mov     $len,%r15               # reassign $len
 638
 639        mov     0($ctx),$d1             # load hash value
 640        mov     8($ctx),$d2
 641        mov     16($ctx),$h2#d
 642
 643        mov     24($ctx),$r0            # load r
 644        mov     32($ctx),$s1
 645
 646        ################################# base 2^26 -> base 2^64
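	# (i.e. h[0] + 2^26*h[1] + 2^52*h[2] + 2^78*h[3] + 2^104*h[4] is
	#  repacked into three 64-bit limbs h0 + 2^64*h1 + 2^128*h2; the
	#  shifts below simply spell out that identity.)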
 647        mov     $d1#d,$h0#d
 648        and     \$`-1*(1<<31)`,$d1
 649        mov     $d2,$r1                 # borrow $r1
 650        mov     $d2#d,$h1#d
 651        and     \$`-1*(1<<31)`,$d2
 652
 653        shr     \$6,$d1
 654        shl     \$52,$r1
 655        add     $d1,$h0
 656        shr     \$12,$h1
 657        shr     \$18,$d2
 658        add     $r1,$h0
 659        adc     $d2,$h1
 660
 661        mov     $h2,$d1
 662        shl     \$40,$d1
 663        shr     \$24,$h2
 664        add     $d1,$h1
 665        adc     \$0,$h2                 # can be partially reduced...
 666
 667        mov     \$-4,$d2                # ... so reduce
 668        mov     $h2,$d1
 669        and     $h2,$d2
 670        shr     \$2,$d1
 671        and     \$3,$h2
 672        add     $d2,$d1                 # =*5
 673        add     $d1,$h0
 674        adc     \$0,$h1
 675        adc     \$0,$h2
 676
 677        mov     $s1,$r1
 678        mov     $s1,%rax
 679        shr     \$2,$s1
 680        add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
 681
 682        add     0($inp),$h0             # accumulate input
 683        adc     8($inp),$h1
 684        lea     16($inp),$inp
 685        adc     $padbit,$h2
 686
 687        call    __poly1305_block
 688
 689        test    $padbit,$padbit         # if $padbit is zero,
 690        jz      .Lstore_base2_64_avx    # store hash in base 2^64 format
 691
 692        ################################# base 2^64 -> base 2^26
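	# (the reverse split: h0 yields the two lowest 26-bit digits, the
	#  third digit straddles the h0/h1 boundary, the fourth comes from
	#  h1, and the fifth straddles h1/h2.)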
 693        mov     $h0,%rax
 694        mov     $h0,%rdx
 695        shr     \$52,$h0
 696        mov     $h1,$r0
 697        mov     $h1,$r1
 698        shr     \$26,%rdx
 699        and     \$0x3ffffff,%rax        # h[0]
 700        shl     \$12,$r0
 701        and     \$0x3ffffff,%rdx        # h[1]
 702        shr     \$14,$h1
 703        or      $r0,$h0
 704        shl     \$24,$h2
 705        and     \$0x3ffffff,$h0         # h[2]
 706        shr     \$40,$r1
 707        and     \$0x3ffffff,$h1         # h[3]
 708        or      $r1,$h2                 # h[4]
 709
 710        sub     \$16,%r15
 711        jz      .Lstore_base2_26_avx
 712
 713        vmovd   %rax#d,$H0
 714        vmovd   %rdx#d,$H1
 715        vmovd   $h0#d,$H2
 716        vmovd   $h1#d,$H3
 717        vmovd   $h2#d,$H4
 718        jmp     .Lproceed_avx
 719
 720.align  32
 721.Lstore_base2_64_avx:
 722        mov     $h0,0($ctx)
 723        mov     $h1,8($ctx)
 724        mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
 725        jmp     .Ldone_avx
 726
 727.align  16
 728.Lstore_base2_26_avx:
 729        mov     %rax#d,0($ctx)          # store hash value base 2^26
 730        mov     %rdx#d,4($ctx)
 731        mov     $h0#d,8($ctx)
 732        mov     $h1#d,12($ctx)
 733        mov     $h2#d,16($ctx)
 734.align  16
 735.Ldone_avx:
 736        pop             %r15
 737.cfi_restore    %r15
 738        pop             %r14
 739.cfi_restore    %r14
 740        pop             %r13
 741.cfi_restore    %r13
 742        pop             %r12
 743.cfi_restore    %r12
 744        pop             %rbx
 745.cfi_restore    %rbx
 746        pop             %rbp
 747.cfi_restore    %rbp
 748.Lno_data_avx:
 749.Lblocks_avx_epilogue:
 750        ret
 751.cfi_endproc
 752
 753.align  32
 754.Lbase2_64_avx:
 755.cfi_startproc
 756        push    %rbp
 757.cfi_push       %rbp
 758        mov     %rsp,%rbp
 759        push    %rbx
 760.cfi_push       %rbx
 761        push    %r12
 762.cfi_push       %r12
 763        push    %r13
 764.cfi_push       %r13
 765        push    %r14
 766.cfi_push       %r14
 767        push    %r15
 768.cfi_push       %r15
 769.Lbase2_64_avx_body:
 770
 771        mov     $len,%r15               # reassign $len
 772
 773        mov     24($ctx),$r0            # load r
 774        mov     32($ctx),$s1
 775
 776        mov     0($ctx),$h0             # load hash value
 777        mov     8($ctx),$h1
 778        mov     16($ctx),$h2#d
 779
 780        mov     $s1,$r1
 781        mov     $s1,%rax
 782        shr     \$2,$s1
 783        add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
 784
 785        test    \$31,$len
 786        jz      .Linit_avx
 787
 788        add     0($inp),$h0             # accumulate input
 789        adc     8($inp),$h1
 790        lea     16($inp),$inp
 791        adc     $padbit,$h2
 792        sub     \$16,%r15
 793
 794        call    __poly1305_block
 795
 796.Linit_avx:
 797        ################################# base 2^64 -> base 2^26
 798        mov     $h0,%rax
 799        mov     $h0,%rdx
 800        shr     \$52,$h0
 801        mov     $h1,$d1
 802        mov     $h1,$d2
 803        shr     \$26,%rdx
 804        and     \$0x3ffffff,%rax        # h[0]
 805        shl     \$12,$d1
 806        and     \$0x3ffffff,%rdx        # h[1]
 807        shr     \$14,$h1
 808        or      $d1,$h0
 809        shl     \$24,$h2
 810        and     \$0x3ffffff,$h0         # h[2]
 811        shr     \$40,$d2
 812        and     \$0x3ffffff,$h1         # h[3]
 813        or      $d2,$h2                 # h[4]
 814
 815        vmovd   %rax#d,$H0
 816        vmovd   %rdx#d,$H1
 817        vmovd   $h0#d,$H2
 818        vmovd   $h1#d,$H3
 819        vmovd   $h2#d,$H4
 820        movl    \$1,20($ctx)            # set is_base2_26
 821
 822        call    __poly1305_init_avx
 823
 824.Lproceed_avx:
 825        mov     %r15,$len
 826        pop             %r15
 827.cfi_restore    %r15
 828        pop             %r14
 829.cfi_restore    %r14
 830        pop             %r13
 831.cfi_restore    %r13
 832        pop             %r12
 833.cfi_restore    %r12
 834        pop             %rbx
 835.cfi_restore    %rbx
 836        pop             %rbp
 837.cfi_restore    %rbp
 838.Lbase2_64_avx_epilogue:
 839        jmp     .Ldo_avx
 840.cfi_endproc
 841
 842.align  32
 843.Leven_avx:
 844.cfi_startproc
 845        vmovd           4*0($ctx),$H0           # load hash value
 846        vmovd           4*1($ctx),$H1
 847        vmovd           4*2($ctx),$H2
 848        vmovd           4*3($ctx),$H3
 849        vmovd           4*4($ctx),$H4
 850
 851.Ldo_avx:
 852___
 853$code.=<<___    if (!$win64);
 854        lea             8(%rsp),%r10
 855.cfi_def_cfa_register   %r10
 856        and             \$-32,%rsp
 857        sub             \$-8,%rsp
 858        lea             -0x58(%rsp),%r11
 859        sub             \$0x178,%rsp
 860___
 861$code.=<<___    if ($win64);
 862        lea             -0xf8(%rsp),%r11
 863        sub             \$0x218,%rsp
 864        vmovdqa         %xmm6,0x50(%r11)
 865        vmovdqa         %xmm7,0x60(%r11)
 866        vmovdqa         %xmm8,0x70(%r11)
 867        vmovdqa         %xmm9,0x80(%r11)
 868        vmovdqa         %xmm10,0x90(%r11)
 869        vmovdqa         %xmm11,0xa0(%r11)
 870        vmovdqa         %xmm12,0xb0(%r11)
 871        vmovdqa         %xmm13,0xc0(%r11)
 872        vmovdqa         %xmm14,0xd0(%r11)
 873        vmovdqa         %xmm15,0xe0(%r11)
 874.Ldo_avx_body:
 875___
 876$code.=<<___;
 877        sub             \$64,$len
 878        lea             -32($inp),%rax
 879        cmovc           %rax,$inp
 880
 881        vmovdqu         `16*3`($ctx),$D4        # preload r0^2
 882        lea             `16*3+64`($ctx),$ctx    # size optimization
 883        lea             .Lconst(%rip),%rcx
 884
 885        ################################################################
 886        # load input
 887        vmovdqu         16*2($inp),$T0
 888        vmovdqu         16*3($inp),$T1
 889        vmovdqa         64(%rcx),$MASK          # .Lmask26
 890
 891        vpsrldq         \$6,$T0,$T2             # splat input
 892        vpsrldq         \$6,$T1,$T3
 893        vpunpckhqdq     $T1,$T0,$T4             # 4
 894        vpunpcklqdq     $T1,$T0,$T0             # 0:1
 895        vpunpcklqdq     $T3,$T2,$T3             # 2:3
 896
 897        vpsrlq          \$40,$T4,$T4            # 4
 898        vpsrlq          \$26,$T0,$T1
 899        vpand           $MASK,$T0,$T0           # 0
 900        vpsrlq          \$4,$T3,$T2
 901        vpand           $MASK,$T1,$T1           # 1
 902        vpsrlq          \$30,$T3,$T3
 903        vpand           $MASK,$T2,$T2           # 2
 904        vpand           $MASK,$T3,$T3           # 3
 905        vpor            32(%rcx),$T4,$T4        # padbit, yes, always
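	# (32(%rcx) is .L129 = 1<<24; OR-ing it into the limb that carries
	#  bits 104..129 of each block sets bit 128, the pad bit Poly1305
	#  appends to every full 16-byte block.)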
 906
 907        jbe             .Lskip_loop_avx
 908
 909        # expand and copy pre-calculated table to stack
 910        vmovdqu         `16*1-64`($ctx),$D1
 911        vmovdqu         `16*2-64`($ctx),$D2
 912        vpshufd         \$0xEE,$D4,$D3          # 34xx -> 3434
 913        vpshufd         \$0x44,$D4,$D0          # xx12 -> 1212
 914        vmovdqa         $D3,-0x90(%r11)
 915        vmovdqa         $D0,0x00(%rsp)
 916        vpshufd         \$0xEE,$D1,$D4
 917        vmovdqu         `16*3-64`($ctx),$D0
 918        vpshufd         \$0x44,$D1,$D1
 919        vmovdqa         $D4,-0x80(%r11)
 920        vmovdqa         $D1,0x10(%rsp)
 921        vpshufd         \$0xEE,$D2,$D3
 922        vmovdqu         `16*4-64`($ctx),$D1
 923        vpshufd         \$0x44,$D2,$D2
 924        vmovdqa         $D3,-0x70(%r11)
 925        vmovdqa         $D2,0x20(%rsp)
 926        vpshufd         \$0xEE,$D0,$D4
 927        vmovdqu         `16*5-64`($ctx),$D2
 928        vpshufd         \$0x44,$D0,$D0
 929        vmovdqa         $D4,-0x60(%r11)
 930        vmovdqa         $D0,0x30(%rsp)
 931        vpshufd         \$0xEE,$D1,$D3
 932        vmovdqu         `16*6-64`($ctx),$D0
 933        vpshufd         \$0x44,$D1,$D1
 934        vmovdqa         $D3,-0x50(%r11)
 935        vmovdqa         $D1,0x40(%rsp)
 936        vpshufd         \$0xEE,$D2,$D4
 937        vmovdqu         `16*7-64`($ctx),$D1
 938        vpshufd         \$0x44,$D2,$D2
 939        vmovdqa         $D4,-0x40(%r11)
 940        vmovdqa         $D2,0x50(%rsp)
 941        vpshufd         \$0xEE,$D0,$D3
 942        vmovdqu         `16*8-64`($ctx),$D2
 943        vpshufd         \$0x44,$D0,$D0
 944        vmovdqa         $D3,-0x30(%r11)
 945        vmovdqa         $D0,0x60(%rsp)
 946        vpshufd         \$0xEE,$D1,$D4
 947        vpshufd         \$0x44,$D1,$D1
 948        vmovdqa         $D4,-0x20(%r11)
 949        vmovdqa         $D1,0x70(%rsp)
 950        vpshufd         \$0xEE,$D2,$D3
 951         vmovdqa        0x00(%rsp),$D4          # preload r0^2
 952        vpshufd         \$0x44,$D2,$D2
 953        vmovdqa         $D3,-0x10(%r11)
 954        vmovdqa         $D2,0x80(%rsp)
 955
 956        jmp             .Loop_avx
 957
 958.align  32
 959.Loop_avx:
 960        ################################################################
 961        # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
 962        # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
 963        #   \___________________/
 964        # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
 965        # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
 966        #   \___________________/ \____________________/
 967        #
 968        # Note that we start with inp[2:3]*r^2. This is because it
 969        # doesn't depend on the reduction in the previous iteration.
 970        ################################################################
 971        # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
 972        # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
 973        # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
 974        # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
 975        # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
 976        #
 977        # though note that $Tx and $Hx are "reversed" in this section,
 978        # and $D4 is preloaded with r0^2...
 979
 980        vpmuludq        $T0,$D4,$D0             # d0 = h0*r0
 981        vpmuludq        $T1,$D4,$D1             # d1 = h1*r0
 982          vmovdqa       $H2,0x20(%r11)                          # offload hash
 983        vpmuludq        $T2,$D4,$D2             # d2 = h2*r0
 984         vmovdqa        0x10(%rsp),$H2          # r1^2
 985        vpmuludq        $T3,$D4,$D3             # d3 = h3*r0
 986        vpmuludq        $T4,$D4,$D4             # d4 = h4*r0
 987
 988          vmovdqa       $H0,0x00(%r11)                          #
 989        vpmuludq        0x20(%rsp),$T4,$H0      # h4*s1
 990          vmovdqa       $H1,0x10(%r11)                          #
 991        vpmuludq        $T3,$H2,$H1             # h3*r1
 992        vpaddq          $H0,$D0,$D0             # d0 += h4*s1
 993        vpaddq          $H1,$D4,$D4             # d4 += h3*r1
 994          vmovdqa       $H3,0x30(%r11)                          #
 995        vpmuludq        $T2,$H2,$H0             # h2*r1
 996        vpmuludq        $T1,$H2,$H1             # h1*r1
 997        vpaddq          $H0,$D3,$D3             # d3 += h2*r1
 998         vmovdqa        0x30(%rsp),$H3          # r2^2
 999        vpaddq          $H1,$D2,$D2             # d2 += h1*r1
1000          vmovdqa       $H4,0x40(%r11)                          #
1001        vpmuludq        $T0,$H2,$H2             # h0*r1
1002         vpmuludq       $T2,$H3,$H0             # h2*r2
1003        vpaddq          $H2,$D1,$D1             # d1 += h0*r1
1004
1005         vmovdqa        0x40(%rsp),$H4          # s2^2
1006        vpaddq          $H0,$D4,$D4             # d4 += h2*r2
1007        vpmuludq        $T1,$H3,$H1             # h1*r2
1008        vpmuludq        $T0,$H3,$H3             # h0*r2
1009        vpaddq          $H1,$D3,$D3             # d3 += h1*r2
1010         vmovdqa        0x50(%rsp),$H2          # r3^2
1011        vpaddq          $H3,$D2,$D2             # d2 += h0*r2
1012        vpmuludq        $T4,$H4,$H0             # h4*s2
1013        vpmuludq        $T3,$H4,$H4             # h3*s2
1014        vpaddq          $H0,$D1,$D1             # d1 += h4*s2
1015         vmovdqa        0x60(%rsp),$H3          # s3^2
1016        vpaddq          $H4,$D0,$D0             # d0 += h3*s2
1017
1018         vmovdqa        0x80(%rsp),$H4          # s4^2
1019        vpmuludq        $T1,$H2,$H1             # h1*r3
1020        vpmuludq        $T0,$H2,$H2             # h0*r3
1021        vpaddq          $H1,$D4,$D4             # d4 += h1*r3
1022        vpaddq          $H2,$D3,$D3             # d3 += h0*r3
1023        vpmuludq        $T4,$H3,$H0             # h4*s3
1024        vpmuludq        $T3,$H3,$H1             # h3*s3
1025        vpaddq          $H0,$D2,$D2             # d2 += h4*s3
1026         vmovdqu        16*0($inp),$H0                          # load input
1027        vpaddq          $H1,$D1,$D1             # d1 += h3*s3
1028        vpmuludq        $T2,$H3,$H3             # h2*s3
1029         vpmuludq       $T2,$H4,$T2             # h2*s4
1030        vpaddq          $H3,$D0,$D0             # d0 += h2*s3
1031
1032         vmovdqu        16*1($inp),$H1                          #
1033        vpaddq          $T2,$D1,$D1             # d1 += h2*s4
1034        vpmuludq        $T3,$H4,$T3             # h3*s4
1035        vpmuludq        $T4,$H4,$T4             # h4*s4
1036         vpsrldq        \$6,$H0,$H2                             # splat input
1037        vpaddq          $T3,$D2,$D2             # d2 += h3*s4
1038        vpaddq          $T4,$D3,$D3             # d3 += h4*s4
1039         vpsrldq        \$6,$H1,$H3                             #
1040        vpmuludq        0x70(%rsp),$T0,$T4      # h0*r4
1041        vpmuludq        $T1,$H4,$T0             # h1*s4
1042         vpunpckhqdq    $H1,$H0,$H4             # 4
1043        vpaddq          $T4,$D4,$D4             # d4 += h0*r4
1044         vmovdqa        -0x90(%r11),$T4         # r0^4
1045        vpaddq          $T0,$D0,$D0             # d0 += h1*s4
1046
1047        vpunpcklqdq     $H1,$H0,$H0             # 0:1
1048        vpunpcklqdq     $H3,$H2,$H3             # 2:3
1049
1050        #vpsrlq         \$40,$H4,$H4            # 4
1051        vpsrldq         \$`40/8`,$H4,$H4        # 4
1052        vpsrlq          \$26,$H0,$H1
1053        vpand           $MASK,$H0,$H0           # 0
1054        vpsrlq          \$4,$H3,$H2
1055        vpand           $MASK,$H1,$H1           # 1
1056        vpand           0(%rcx),$H4,$H4         # .Lmask24
1057        vpsrlq          \$30,$H3,$H3
1058        vpand           $MASK,$H2,$H2           # 2
1059        vpand           $MASK,$H3,$H3           # 3
1060        vpor            32(%rcx),$H4,$H4        # padbit, yes, always
1061
1062        vpaddq          0x00(%r11),$H0,$H0      # add hash value
1063        vpaddq          0x10(%r11),$H1,$H1
1064        vpaddq          0x20(%r11),$H2,$H2
1065        vpaddq          0x30(%r11),$H3,$H3
1066        vpaddq          0x40(%r11),$H4,$H4
1067
1068        lea             16*2($inp),%rax
1069        lea             16*4($inp),$inp
1070        sub             \$64,$len
1071        cmovc           %rax,$inp
1072
1073        ################################################################
1074        # Now we accumulate (inp[0:1]+hash)*r^4
1075        ################################################################
1076        # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1077        # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1078        # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1079        # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1080        # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1081
1082        vpmuludq        $H0,$T4,$T0             # h0*r0
1083        vpmuludq        $H1,$T4,$T1             # h1*r0
1084        vpaddq          $T0,$D0,$D0
1085        vpaddq          $T1,$D1,$D1
1086         vmovdqa        -0x80(%r11),$T2         # r1^4
1087        vpmuludq        $H2,$T4,$T0             # h2*r0
1088        vpmuludq        $H3,$T4,$T1             # h3*r0
1089        vpaddq          $T0,$D2,$D2
1090        vpaddq          $T1,$D3,$D3
1091        vpmuludq        $H4,$T4,$T4             # h4*r0
1092         vpmuludq       -0x70(%r11),$H4,$T0     # h4*s1
1093        vpaddq          $T4,$D4,$D4
1094
1095        vpaddq          $T0,$D0,$D0             # d0 += h4*s1
1096        vpmuludq        $H2,$T2,$T1             # h2*r1
1097        vpmuludq        $H3,$T2,$T0             # h3*r1
1098        vpaddq          $T1,$D3,$D3             # d3 += h2*r1
1099         vmovdqa        -0x60(%r11),$T3         # r2^4
1100        vpaddq          $T0,$D4,$D4             # d4 += h3*r1
1101        vpmuludq        $H1,$T2,$T1             # h1*r1
1102        vpmuludq        $H0,$T2,$T2             # h0*r1
1103        vpaddq          $T1,$D2,$D2             # d2 += h1*r1
1104        vpaddq          $T2,$D1,$D1             # d1 += h0*r1
1105
1106         vmovdqa        -0x50(%r11),$T4         # s2^4
1107        vpmuludq        $H2,$T3,$T0             # h2*r2
1108        vpmuludq        $H1,$T3,$T1             # h1*r2
1109        vpaddq          $T0,$D4,$D4             # d4 += h2*r2
1110        vpaddq          $T1,$D3,$D3             # d3 += h1*r2
1111         vmovdqa        -0x40(%r11),$T2         # r3^4
1112        vpmuludq        $H0,$T3,$T3             # h0*r2
1113        vpmuludq        $H4,$T4,$T0             # h4*s2
1114        vpaddq          $T3,$D2,$D2             # d2 += h0*r2
1115        vpaddq          $T0,$D1,$D1             # d1 += h4*s2
1116         vmovdqa        -0x30(%r11),$T3         # s3^4
1117        vpmuludq        $H3,$T4,$T4             # h3*s2
1118         vpmuludq       $H1,$T2,$T1             # h1*r3
1119        vpaddq          $T4,$D0,$D0             # d0 += h3*s2
1120
1121         vmovdqa        -0x10(%r11),$T4         # s4^4
1122        vpaddq          $T1,$D4,$D4             # d4 += h1*r3
1123        vpmuludq        $H0,$T2,$T2             # h0*r3
1124        vpmuludq        $H4,$T3,$T0             # h4*s3
1125        vpaddq          $T2,$D3,$D3             # d3 += h0*r3
1126        vpaddq          $T0,$D2,$D2             # d2 += h4*s3
1127         vmovdqu        16*2($inp),$T0                          # load input
1128        vpmuludq        $H3,$T3,$T2             # h3*s3
1129        vpmuludq        $H2,$T3,$T3             # h2*s3
1130        vpaddq          $T2,$D1,$D1             # d1 += h3*s3
1131         vmovdqu        16*3($inp),$T1                          #
1132        vpaddq          $T3,$D0,$D0             # d0 += h2*s3
1133
1134        vpmuludq        $H2,$T4,$H2             # h2*s4
1135        vpmuludq        $H3,$T4,$H3             # h3*s4
1136         vpsrldq        \$6,$T0,$T2                             # splat input
1137        vpaddq          $H2,$D1,$D1             # d1 += h2*s4
1138        vpmuludq        $H4,$T4,$H4             # h4*s4
1139         vpsrldq        \$6,$T1,$T3                             #
1140        vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
1141        vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
1142        vpmuludq        -0x20(%r11),$H0,$H4     # h0*r4
1143        vpmuludq        $H1,$T4,$H0
1144         vpunpckhqdq    $T1,$T0,$T4             # 4
1145        vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
1146        vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
1147
1148        vpunpcklqdq     $T1,$T0,$T0             # 0:1
1149        vpunpcklqdq     $T3,$T2,$T3             # 2:3
1150
1151        #vpsrlq         \$40,$T4,$T4            # 4
1152        vpsrldq         \$`40/8`,$T4,$T4        # 4
1153        vpsrlq          \$26,$T0,$T1
1154         vmovdqa        0x00(%rsp),$D4          # preload r0^2
1155        vpand           $MASK,$T0,$T0           # 0
1156        vpsrlq          \$4,$T3,$T2
1157        vpand           $MASK,$T1,$T1           # 1
1158        vpand           0(%rcx),$T4,$T4         # .Lmask24
1159        vpsrlq          \$30,$T3,$T3
1160        vpand           $MASK,$T2,$T2           # 2
1161        vpand           $MASK,$T3,$T3           # 3
1162        vpor            32(%rcx),$T4,$T4        # padbit, yes, always
1163
1164        ################################################################
1165        # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1166        # and P. Schwabe
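	# (the carry leaving the top limb h4 re-enters at h0 multiplied by 5,
	#  implemented below as adding it once and once shifted left by 2,
	#  i.e. 5*c = c + 4*c, again because 2^130 == 5 mod 2^130-5.)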
1167
1168        vpsrlq          \$26,$H3,$D3
1169        vpand           $MASK,$H3,$H3
1170        vpaddq          $D3,$H4,$H4             # h3 -> h4
1171
1172        vpsrlq          \$26,$H0,$D0
1173        vpand           $MASK,$H0,$H0
1174        vpaddq          $D0,$D1,$H1             # h0 -> h1
1175
1176        vpsrlq          \$26,$H4,$D0
1177        vpand           $MASK,$H4,$H4
1178
1179        vpsrlq          \$26,$H1,$D1
1180        vpand           $MASK,$H1,$H1
1181        vpaddq          $D1,$H2,$H2             # h1 -> h2
1182
1183        vpaddq          $D0,$H0,$H0
1184        vpsllq          \$2,$D0,$D0
1185        vpaddq          $D0,$H0,$H0             # h4 -> h0
1186
1187        vpsrlq          \$26,$H2,$D2
1188        vpand           $MASK,$H2,$H2
1189        vpaddq          $D2,$H3,$H3             # h2 -> h3
1190
1191        vpsrlq          \$26,$H0,$D0
1192        vpand           $MASK,$H0,$H0
1193        vpaddq          $D0,$H1,$H1             # h0 -> h1
1194
1195        vpsrlq          \$26,$H3,$D3
1196        vpand           $MASK,$H3,$H3
1197        vpaddq          $D3,$H4,$H4             # h3 -> h4
1198
1199        ja              .Loop_avx
1200
1201.Lskip_loop_avx:
1202        ################################################################
1203        # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1204
1205        vpshufd         \$0x10,$D4,$D4          # r0^n, xx12 -> x1x2
1206        add             \$32,$len
1207        jnz             .Long_tail_avx
1208
1209        vpaddq          $H2,$T2,$T2
1210        vpaddq          $H0,$T0,$T0
1211        vpaddq          $H1,$T1,$T1
1212        vpaddq          $H3,$T3,$T3
1213        vpaddq          $H4,$T4,$T4
1214
1215.Long_tail_avx:
1216        vmovdqa         $H2,0x20(%r11)
1217        vmovdqa         $H0,0x00(%r11)
1218        vmovdqa         $H1,0x10(%r11)
1219        vmovdqa         $H3,0x30(%r11)
1220        vmovdqa         $H4,0x40(%r11)
1221
1222        # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1223        # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1224        # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1225        # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1226        # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1227
1228        vpmuludq        $T2,$D4,$D2             # d2 = h2*r0
1229        vpmuludq        $T0,$D4,$D0             # d0 = h0*r0
1230         vpshufd        \$0x10,`16*1-64`($ctx),$H2              # r1^n
1231        vpmuludq        $T1,$D4,$D1             # d1 = h1*r0
1232        vpmuludq        $T3,$D4,$D3             # d3 = h3*r0
1233        vpmuludq        $T4,$D4,$D4             # d4 = h4*r0
1234
1235        vpmuludq        $T3,$H2,$H0             # h3*r1
1236        vpaddq          $H0,$D4,$D4             # d4 += h3*r1
1237         vpshufd        \$0x10,`16*2-64`($ctx),$H3              # s1^n
1238        vpmuludq        $T2,$H2,$H1             # h2*r1
1239        vpaddq          $H1,$D3,$D3             # d3 += h2*r1
1240         vpshufd        \$0x10,`16*3-64`($ctx),$H4              # r2^n
1241        vpmuludq        $T1,$H2,$H0             # h1*r1
1242        vpaddq          $H0,$D2,$D2             # d2 += h1*r1
1243        vpmuludq        $T0,$H2,$H2             # h0*r1
1244        vpaddq          $H2,$D1,$D1             # d1 += h0*r1
1245        vpmuludq        $T4,$H3,$H3             # h4*s1
1246        vpaddq          $H3,$D0,$D0             # d0 += h4*s1
1247
1248         vpshufd        \$0x10,`16*4-64`($ctx),$H2              # s2^n
1249        vpmuludq        $T2,$H4,$H1             # h2*r2
1250        vpaddq          $H1,$D4,$D4             # d4 += h2*r2
1251        vpmuludq        $T1,$H4,$H0             # h1*r2
1252        vpaddq          $H0,$D3,$D3             # d3 += h1*r2
1253         vpshufd        \$0x10,`16*5-64`($ctx),$H3              # r3^n
1254        vpmuludq        $T0,$H4,$H4             # h0*r2
1255        vpaddq          $H4,$D2,$D2             # d2 += h0*r2
1256        vpmuludq        $T4,$H2,$H1             # h4*s2
1257        vpaddq          $H1,$D1,$D1             # d1 += h4*s2
1258         vpshufd        \$0x10,`16*6-64`($ctx),$H4              # s3^n
1259        vpmuludq        $T3,$H2,$H2             # h3*s2
1260        vpaddq          $H2,$D0,$D0             # d0 += h3*s2
1261
1262        vpmuludq        $T1,$H3,$H0             # h1*r3
1263        vpaddq          $H0,$D4,$D4             # d4 += h1*r3
1264        vpmuludq        $T0,$H3,$H3             # h0*r3
1265        vpaddq          $H3,$D3,$D3             # d3 += h0*r3
1266         vpshufd        \$0x10,`16*7-64`($ctx),$H2              # r4^n
1267        vpmuludq        $T4,$H4,$H1             # h4*s3
1268        vpaddq          $H1,$D2,$D2             # d2 += h4*s3
1269         vpshufd        \$0x10,`16*8-64`($ctx),$H3              # s4^n
1270        vpmuludq        $T3,$H4,$H0             # h3*s3
1271        vpaddq          $H0,$D1,$D1             # d1 += h3*s3
1272        vpmuludq        $T2,$H4,$H4             # h2*s3
1273        vpaddq          $H4,$D0,$D0             # d0 += h2*s3
1274
1275        vpmuludq        $T0,$H2,$H2             # h0*r4
1276        vpaddq          $H2,$D4,$D4             # h4 = d4 + h0*r4
1277        vpmuludq        $T4,$H3,$H1             # h4*s4
1278        vpaddq          $H1,$D3,$D3             # h3 = d3 + h4*s4
1279        vpmuludq        $T3,$H3,$H0             # h3*s4
1280        vpaddq          $H0,$D2,$D2             # h2 = d2 + h3*s4
1281        vpmuludq        $T2,$H3,$H1             # h2*s4
1282        vpaddq          $H1,$D1,$D1             # h1 = d1 + h2*s4
1283        vpmuludq        $T1,$H3,$H3             # h1*s4
1284        vpaddq          $H3,$D0,$D0             # h0 = d0 + h1*s4
1285
1286        jz              .Lshort_tail_avx
1287
1288        vmovdqu         16*0($inp),$H0          # load input
1289        vmovdqu         16*1($inp),$H1
1290
1291        vpsrldq         \$6,$H0,$H2             # splat input
1292        vpsrldq         \$6,$H1,$H3
1293        vpunpckhqdq     $H1,$H0,$H4             # 4
1294        vpunpcklqdq     $H1,$H0,$H0             # 0:1
1295        vpunpcklqdq     $H3,$H2,$H3             # 2:3
1296
1297        vpsrlq          \$40,$H4,$H4            # 4
1298        vpsrlq          \$26,$H0,$H1
1299        vpand           $MASK,$H0,$H0           # 0
1300        vpsrlq          \$4,$H3,$H2
1301        vpand           $MASK,$H1,$H1           # 1
1302        vpsrlq          \$30,$H3,$H3
1303        vpand           $MASK,$H2,$H2           # 2
1304        vpand           $MASK,$H3,$H3           # 3
1305        vpor            32(%rcx),$H4,$H4        # padbit, yes, always
1306
1307        vpshufd         \$0x32,`16*0-64`($ctx),$T4      # r0^n, 34xx -> x3x4
1308        vpaddq          0x00(%r11),$H0,$H0
1309        vpaddq          0x10(%r11),$H1,$H1
1310        vpaddq          0x20(%r11),$H2,$H2
1311        vpaddq          0x30(%r11),$H3,$H3
1312        vpaddq          0x40(%r11),$H4,$H4
1313
1314        ################################################################
1315        # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1316
1317        vpmuludq        $H0,$T4,$T0             # h0*r0
1318        vpaddq          $T0,$D0,$D0             # d0 += h0*r0
1319        vpmuludq        $H1,$T4,$T1             # h1*r0
1320        vpaddq          $T1,$D1,$D1             # d1 += h1*r0
1321        vpmuludq        $H2,$T4,$T0             # h2*r0
1322        vpaddq          $T0,$D2,$D2             # d2 += h2*r0
1323         vpshufd        \$0x32,`16*1-64`($ctx),$T2              # r1^n
1324        vpmuludq        $H3,$T4,$T1             # h3*r0
1325        vpaddq          $T1,$D3,$D3             # d3 += h3*r0
1326        vpmuludq        $H4,$T4,$T4             # h4*r0
1327        vpaddq          $T4,$D4,$D4             # d4 += h4*r0
1328
1329        vpmuludq        $H3,$T2,$T0             # h3*r1
1330        vpaddq          $T0,$D4,$D4             # d4 += h3*r1
1331         vpshufd        \$0x32,`16*2-64`($ctx),$T3              # s1
1332        vpmuludq        $H2,$T2,$T1             # h2*r1
1333        vpaddq          $T1,$D3,$D3             # d3 += h2*r1
1334         vpshufd        \$0x32,`16*3-64`($ctx),$T4              # r2
1335        vpmuludq        $H1,$T2,$T0             # h1*r1
1336        vpaddq          $T0,$D2,$D2             # d2 += h1*r1
1337        vpmuludq        $H0,$T2,$T2             # h0*r1
1338        vpaddq          $T2,$D1,$D1             # d1 += h0*r1
1339        vpmuludq        $H4,$T3,$T3             # h4*s1
1340        vpaddq          $T3,$D0,$D0             # d0 += h4*s1
1341
1342         vpshufd        \$0x32,`16*4-64`($ctx),$T2              # s2
1343        vpmuludq        $H2,$T4,$T1             # h2*r2
1344        vpaddq          $T1,$D4,$D4             # d4 += h2*r2
1345        vpmuludq        $H1,$T4,$T0             # h1*r2
1346        vpaddq          $T0,$D3,$D3             # d3 += h1*r2
1347         vpshufd        \$0x32,`16*5-64`($ctx),$T3              # r3
1348        vpmuludq        $H0,$T4,$T4             # h0*r2
1349        vpaddq          $T4,$D2,$D2             # d2 += h0*r2
1350        vpmuludq        $H4,$T2,$T1             # h4*s2
1351        vpaddq          $T1,$D1,$D1             # d1 += h4*s2
1352         vpshufd        \$0x32,`16*6-64`($ctx),$T4              # s3
1353        vpmuludq        $H3,$T2,$T2             # h3*s2
1354        vpaddq          $T2,$D0,$D0             # d0 += h3*s2
1355
1356        vpmuludq        $H1,$T3,$T0             # h1*r3
1357        vpaddq          $T0,$D4,$D4             # d4 += h1*r3
1358        vpmuludq        $H0,$T3,$T3             # h0*r3
1359        vpaddq          $T3,$D3,$D3             # d3 += h0*r3
1360         vpshufd        \$0x32,`16*7-64`($ctx),$T2              # r4
1361        vpmuludq        $H4,$T4,$T1             # h4*s3
1362        vpaddq          $T1,$D2,$D2             # d2 += h4*s3
1363         vpshufd        \$0x32,`16*8-64`($ctx),$T3              # s4
1364        vpmuludq        $H3,$T4,$T0             # h3*s3
1365        vpaddq          $T0,$D1,$D1             # d1 += h3*s3
1366        vpmuludq        $H2,$T4,$T4             # h2*s3
1367        vpaddq          $T4,$D0,$D0             # d0 += h2*s3
1368
1369        vpmuludq        $H0,$T2,$T2             # h0*r4
1370        vpaddq          $T2,$D4,$D4             # d4 += h0*r4
1371        vpmuludq        $H4,$T3,$T1             # h4*s4
1372        vpaddq          $T1,$D3,$D3             # d3 += h4*s4
1373        vpmuludq        $H3,$T3,$T0             # h3*s4
1374        vpaddq          $T0,$D2,$D2             # d2 += h3*s4
1375        vpmuludq        $H2,$T3,$T1             # h2*s4
1376        vpaddq          $T1,$D1,$D1             # d1 += h2*s4
1377        vpmuludq        $H1,$T3,$T3             # h1*s4
1378        vpaddq          $T3,$D0,$D0             # d0 += h1*s4
1379
1380.Lshort_tail_avx:
1381        ################################################################
1382        # horizontal addition
1383
1384        vpsrldq         \$8,$D4,$T4
1385        vpsrldq         \$8,$D3,$T3
1386        vpsrldq         \$8,$D1,$T1
1387        vpsrldq         \$8,$D0,$T0
1388        vpsrldq         \$8,$D2,$T2
1389        vpaddq          $T3,$D3,$D3
1390        vpaddq          $T4,$D4,$D4
1391        vpaddq          $T0,$D0,$D0
1392        vpaddq          $T1,$D1,$D1
1393        vpaddq          $T2,$D2,$D2
1394
1395        ################################################################
1396        # lazy reduction
1397
1398        vpsrlq          \$26,$D3,$H3
1399        vpand           $MASK,$D3,$D3
1400        vpaddq          $H3,$D4,$D4             # h3 -> h4
1401
1402        vpsrlq          \$26,$D0,$H0
1403        vpand           $MASK,$D0,$D0
1404        vpaddq          $H0,$D1,$D1             # h0 -> h1
1405
1406        vpsrlq          \$26,$D4,$H4
1407        vpand           $MASK,$D4,$D4
1408
1409        vpsrlq          \$26,$D1,$H1
1410        vpand           $MASK,$D1,$D1
1411        vpaddq          $H1,$D2,$D2             # h1 -> h2
1412
1413        vpaddq          $H4,$D0,$D0
1414        vpsllq          \$2,$H4,$H4
1415        vpaddq          $H4,$D0,$D0             # h4 -> h0
1416
1417        vpsrlq          \$26,$D2,$H2
1418        vpand           $MASK,$D2,$D2
1419        vpaddq          $H2,$D3,$D3             # h2 -> h3
1420
1421        vpsrlq          \$26,$D0,$H0
1422        vpand           $MASK,$D0,$D0
1423        vpaddq          $H0,$D1,$D1             # h0 -> h1
1424
1425        vpsrlq          \$26,$D3,$H3
1426        vpand           $MASK,$D3,$D3
1427        vpaddq          $H3,$D4,$D4             # h3 -> h4
1428
1429        vmovd           $D0,`4*0-48-64`($ctx)   # save partially reduced
1430        vmovd           $D1,`4*1-48-64`($ctx)
1431        vmovd           $D2,`4*2-48-64`($ctx)
1432        vmovd           $D3,`4*3-48-64`($ctx)
1433        vmovd           $D4,`4*4-48-64`($ctx)
1434___
1435$code.=<<___    if ($win64);
1436        vmovdqa         0x50(%r11),%xmm6
1437        vmovdqa         0x60(%r11),%xmm7
1438        vmovdqa         0x70(%r11),%xmm8
1439        vmovdqa         0x80(%r11),%xmm9
1440        vmovdqa         0x90(%r11),%xmm10
1441        vmovdqa         0xa0(%r11),%xmm11
1442        vmovdqa         0xb0(%r11),%xmm12
1443        vmovdqa         0xc0(%r11),%xmm13
1444        vmovdqa         0xd0(%r11),%xmm14
1445        vmovdqa         0xe0(%r11),%xmm15
1446        lea             0xf8(%r11),%rsp
1447.Ldo_avx_epilogue:
1448___
1449$code.=<<___    if (!$win64);
1450        lea             -8(%r10),%rsp
1451.cfi_def_cfa_register   %rsp
1452___
1453$code.=<<___;
1454        vzeroupper
1455        ret
1456.cfi_endproc
1457___
1458&end_function("poly1305_blocks_avx");
1459
1460&declare_function("poly1305_emit_avx", 32, 3);
1461$code.=<<___;
1462        cmpl    \$0,20($ctx)    # is_base2_26?
1463        je      .Lemit
1464
1465        mov     0($ctx),%eax    # load hash value base 2^26
1466        mov     4($ctx),%ecx
1467        mov     8($ctx),%r8d
1468        mov     12($ctx),%r11d
1469        mov     16($ctx),%r10d
1470
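            # recombine the five 26-bit limbs into the 130-bit value h0:h1:h2
            # in base 2^64 (h0 = bits 0-63, h1 = bits 64-127, h2 = bit 128 and up)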
1471        shl     \$26,%rcx       # base 2^26 -> base 2^64
1472        mov     %r8,%r9
1473        shl     \$52,%r8
1474        add     %rcx,%rax
1475        shr     \$12,%r9
1476        add     %rax,%r8        # h0
1477        adc     \$0,%r9
1478
1479        shl     \$14,%r11
1480        mov     %r10,%rax
1481        shr     \$24,%r10
1482        add     %r11,%r9
1483        shl     \$40,%rax
1484        add     %rax,%r9        # h1
1485        adc     \$0,%r10        # h2
1486
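            # h2 may hold bits above position 1; those bits have weight 2^130,
            # which is congruent to 5 mod 2^130-5, so fold them back in as
            # (h2>>2) + (h2 & ~3) = 5*(h2>>2)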
1487        mov     %r10,%rax       # could be partially reduced, so reduce
1488        mov     %r10,%rcx
1489        and     \$3,%r10
1490        shr     \$2,%rax
1491        and     \$-4,%rcx
1492        add     %rcx,%rax
1493        add     %rax,%r8
1494        adc     \$0,%r9
1495        adc     \$0,%r10
1496
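            # final reduction: compute h+5 and check whether the sum carried
            # into bit 130; if it did, h >= 2^130-5 and the low 128 bits of
            # h+5 are the reduced value (only 128 bits feed the tag), so
            # select them with cmovnz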
1497        mov     %r8,%rax
1498        add     \$5,%r8         # compare to modulus
1499        mov     %r9,%rcx
1500        adc     \$0,%r9
1501        adc     \$0,%r10
1502        shr     \$2,%r10        # did 130-bit value overflow?
1503        cmovnz  %r8,%rax
1504        cmovnz  %r9,%rcx
1505
1506        add     0($nonce),%rax  # accumulate nonce
1507        adc     8($nonce),%rcx
1508        mov     %rax,0($mac)    # write result
1509        mov     %rcx,8($mac)
1510
1511        ret
1512___
1513&end_function("poly1305_emit_avx");
1514
1515if ($avx>1) {
1516
1517my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1518    map("%ymm$_",(0..15));
1519my $S4=$MASK;
1520
1521sub poly1305_blocks_avxN {
1522        my ($avx512) = @_;
1523        my $suffix = $avx512 ? "_avx512" : "";
1524$code.=<<___;
1525.cfi_startproc
1526        mov     20($ctx),%r8d           # is_base2_26
1527        cmp     \$128,$len
1528        jae     .Lblocks_avx2$suffix
1529        test    %r8d,%r8d
1530        jz      .Lblocks
1531
1532.Lblocks_avx2$suffix:
1533        and     \$-16,$len
1534        jz      .Lno_data_avx2$suffix
1535
1536        vzeroupper
1537
1538        test    %r8d,%r8d
1539        jz      .Lbase2_64_avx2$suffix
1540
1541        test    \$63,$len
1542        jz      .Leven_avx2$suffix
1543
1544        push    %rbp
1545.cfi_push       %rbp
1546        mov     %rsp,%rbp
1547        push    %rbx
1548.cfi_push       %rbx
1549        push    %r12
1550.cfi_push       %r12
1551        push    %r13
1552.cfi_push       %r13
1553        push    %r14
1554.cfi_push       %r14
1555        push    %r15
1556.cfi_push       %r15
1557.Lblocks_avx2_body$suffix:
1558
1559        mov     $len,%r15               # reassign $len
1560
1561        mov     0($ctx),$d1             # load hash value
1562        mov     8($ctx),$d2
1563        mov     16($ctx),$h2#d
1564
1565        mov     24($ctx),$r0            # load r
1566        mov     32($ctx),$s1
1567
1568        ################################# base 2^26 -> base 2^64
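            # d1 and d2 each hold two 26-bit limbs stored as 32-bit words
            # (h[4] was loaded separately); reassemble h[0] + h[1]*2^26 + ...
            # + h[4]*2^104 into the three-word base 2^64 form used by
            # __poly1305_block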
1569        mov     $d1#d,$h0#d
1570        and     \$`-1*(1<<31)`,$d1
1571        mov     $d2,$r1                 # borrow $r1
1572        mov     $d2#d,$h1#d
1573        and     \$`-1*(1<<31)`,$d2
1574
1575        shr     \$6,$d1
1576        shl     \$52,$r1
1577        add     $d1,$h0
1578        shr     \$12,$h1
1579        shr     \$18,$d2
1580        add     $r1,$h0
1581        adc     $d2,$h1
1582
1583        mov     $h2,$d1
1584        shl     \$40,$d1
1585        shr     \$24,$h2
1586        add     $d1,$h1
1587        adc     \$0,$h2                 # can be partially reduced...
1588
1589        mov     \$-4,$d2                # ... so reduce
1590        mov     $h2,$d1
1591        and     $h2,$d2
1592        shr     \$2,$d1
1593        and     \$3,$h2
1594        add     $d2,$d1                 # =*5
1595        add     $d1,$h0
1596        adc     \$0,$h1
1597        adc     \$0,$h2
1598
1599        mov     $s1,$r1
1600        mov     $s1,%rax
1601        shr     \$2,$s1
1602        add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
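            # r1 is clamped to a multiple of 4, so s1 = r1 + (r1>>2) = 5*r1/4,
            # which lets the 2^128-weight partial products be folded back
            # mod 2^130-5 without an extra multiplication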
1603
1604.Lbase2_26_pre_avx2$suffix:
1605        add     0($inp),$h0             # accumulate input
1606        adc     8($inp),$h1
1607        lea     16($inp),$inp
1608        adc     $padbit,$h2
1609        sub     \$16,%r15
1610
1611        call    __poly1305_block
1612        mov     $r1,%rax
1613
1614        test    \$63,%r15
1615        jnz     .Lbase2_26_pre_avx2$suffix
1616
1617        test    $padbit,$padbit         # if $padbit is zero,
1618        jz      .Lstore_base2_64_avx2$suffix    # store hash in base 2^64 format
1619
1620        ################################# base 2^64 -> base 2^26
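            # split the 130-bit value h0:h1:h2 back into five 26-bit limbs
            # h[0..4] for the vector code; each "and 0x3ffffff" extracts one
            # 26-bit limb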
1621        mov     $h0,%rax
1622        mov     $h0,%rdx
1623        shr     \$52,$h0
1624        mov     $h1,$r0
1625        mov     $h1,$r1
1626        shr     \$26,%rdx
1627        and     \$0x3ffffff,%rax        # h[0]
1628        shl     \$12,$r0
1629        and     \$0x3ffffff,%rdx        # h[1]
1630        shr     \$14,$h1
1631        or      $r0,$h0
1632        shl     \$24,$h2
1633        and     \$0x3ffffff,$h0         # h[2]
1634        shr     \$40,$r1
1635        and     \$0x3ffffff,$h1         # h[3]
1636        or      $r1,$h2                 # h[4]
1637
1638        test    %r15,%r15
1639        jz      .Lstore_base2_26_avx2$suffix
1640
1641        vmovd   %rax#d,%x#$H0
1642        vmovd   %rdx#d,%x#$H1
1643        vmovd   $h0#d,%x#$H2
1644        vmovd   $h1#d,%x#$H3
1645        vmovd   $h2#d,%x#$H4
1646        jmp     .Lproceed_avx2$suffix
1647
1648.align  32
1649.Lstore_base2_64_avx2$suffix:
1650        mov     $h0,0($ctx)
1651        mov     $h1,8($ctx)
1652        mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
1653        jmp     .Ldone_avx2$suffix
1654
1655.align  16
1656.Lstore_base2_26_avx2$suffix:
1657        mov     %rax#d,0($ctx)          # store hash value base 2^26
1658        mov     %rdx#d,4($ctx)
1659        mov     $h0#d,8($ctx)
1660        mov     $h1#d,12($ctx)
1661        mov     $h2#d,16($ctx)
1662.align  16
1663.Ldone_avx2$suffix:
1664        pop             %r15
1665.cfi_restore    %r15
1666        pop             %r14
1667.cfi_restore    %r14
1668        pop             %r13
1669.cfi_restore    %r13
1670        pop             %r12
1671.cfi_restore    %r12
1672        pop             %rbx
1673.cfi_restore    %rbx
1674        pop             %rbp
1675.cfi_restore    %rbp
1676.Lno_data_avx2$suffix:
1677.Lblocks_avx2_epilogue$suffix:
1678        ret
1679.cfi_endproc
1680
1681.align  32
1682.Lbase2_64_avx2$suffix:
1683.cfi_startproc
1684        push    %rbp
1685.cfi_push       %rbp
1686        mov     %rsp,%rbp
1687        push    %rbx
1688.cfi_push       %rbx
1689        push    %r12
1690.cfi_push       %r12
1691        push    %r13
1692.cfi_push       %r13
1693        push    %r14
1694.cfi_push       %r14
1695        push    %r15
1696.cfi_push       %r15
1697.Lbase2_64_avx2_body$suffix:
1698
1699        mov     $len,%r15               # reassign $len
1700
1701        mov     24($ctx),$r0            # load r
1702        mov     32($ctx),$s1
1703
1704        mov     0($ctx),$h0             # load hash value
1705        mov     8($ctx),$h1
1706        mov     16($ctx),$h2#d
1707
1708        mov     $s1,$r1
1709        mov     $s1,%rax
1710        shr     \$2,$s1
1711        add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
1712
1713        test    \$63,$len
1714        jz      .Linit_avx2$suffix
1715
1716.Lbase2_64_pre_avx2$suffix:
1717        add     0($inp),$h0             # accumulate input
1718        adc     8($inp),$h1
1719        lea     16($inp),$inp
1720        adc     $padbit,$h2
1721        sub     \$16,%r15
1722
1723        call    __poly1305_block
1724        mov     $r1,%rax
1725
1726        test    \$63,%r15
1727        jnz     .Lbase2_64_pre_avx2$suffix
1728
1729.Linit_avx2$suffix:
1730        ################################# base 2^64 -> base 2^26
1731        mov     $h0,%rax
1732        mov     $h0,%rdx
1733        shr     \$52,$h0
1734        mov     $h1,$d1
1735        mov     $h1,$d2
1736        shr     \$26,%rdx
1737        and     \$0x3ffffff,%rax        # h[0]
1738        shl     \$12,$d1
1739        and     \$0x3ffffff,%rdx        # h[1]
1740        shr     \$14,$h1
1741        or      $d1,$h0
1742        shl     \$24,$h2
1743        and     \$0x3ffffff,$h0         # h[2]
1744        shr     \$40,$d2
1745        and     \$0x3ffffff,$h1         # h[3]
1746        or      $d2,$h2                 # h[4]
1747
1748        vmovd   %rax#d,%x#$H0
1749        vmovd   %rdx#d,%x#$H1
1750        vmovd   $h0#d,%x#$H2
1751        vmovd   $h1#d,%x#$H3
1752        vmovd   $h2#d,%x#$H4
1753        movl    \$1,20($ctx)            # set is_base2_26
1754
1755        call    __poly1305_init_avx
1756
1757.Lproceed_avx2$suffix:
1758        mov     %r15,$len                       # restore $len
1759___
1760$code.=<<___ if (!$kernel);
1761        mov     OPENSSL_ia32cap_P+8(%rip),%r9d
1762        mov     \$`(1<<31|1<<30|1<<16)`,%r11d
1763___
1764$code.=<<___;
1765        pop             %r15
1766.cfi_restore    %r15
1767        pop             %r14
1768.cfi_restore    %r14
1769        pop             %r13
1770.cfi_restore    %r13
1771        pop             %r12
1772.cfi_restore    %r12
1773        pop             %rbx
1774.cfi_restore    %rbx
1775        pop             %rbp
1776.cfi_restore    %rbp
1777.Lbase2_64_avx2_epilogue$suffix:
1778        jmp     .Ldo_avx2$suffix
1779.cfi_endproc
1780
1781.align  32
1782.Leven_avx2$suffix:
1783.cfi_startproc
1784___
1785$code.=<<___ if (!$kernel);
1786        mov             OPENSSL_ia32cap_P+8(%rip),%r9d
1787___
1788$code.=<<___;
1789        vmovd           4*0($ctx),%x#$H0        # load hash value base 2^26
1790        vmovd           4*1($ctx),%x#$H1
1791        vmovd           4*2($ctx),%x#$H2
1792        vmovd           4*3($ctx),%x#$H3
1793        vmovd           4*4($ctx),%x#$H4
1794
1795.Ldo_avx2$suffix:
1796___
1797$code.=<<___            if (!$kernel && $avx>2);
1798        cmp             \$512,$len
1799        jb              .Lskip_avx512
1800        and             %r11d,%r9d
1801        test            \$`1<<16`,%r9d          # check for AVX512F
1802        jnz             .Lblocks_avx512
1803.Lskip_avx512$suffix:
1804___
1805$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
1806        cmp             \$512,$len
1807        jae             .Lblocks_avx512
1808___
1809$code.=<<___    if (!$win64);
1810        lea             8(%rsp),%r10
1811.cfi_def_cfa_register   %r10
1812        sub             \$0x128,%rsp
1813___
1814$code.=<<___    if ($win64);
1815        lea             8(%rsp),%r10
1816        sub             \$0x1c8,%rsp
1817        vmovdqa         %xmm6,-0xb0(%r10)
1818        vmovdqa         %xmm7,-0xa0(%r10)
1819        vmovdqa         %xmm8,-0x90(%r10)
1820        vmovdqa         %xmm9,-0x80(%r10)
1821        vmovdqa         %xmm10,-0x70(%r10)
1822        vmovdqa         %xmm11,-0x60(%r10)
1823        vmovdqa         %xmm12,-0x50(%r10)
1824        vmovdqa         %xmm13,-0x40(%r10)
1825        vmovdqa         %xmm14,-0x30(%r10)
1826        vmovdqa         %xmm15,-0x20(%r10)
1827.Ldo_avx2_body$suffix:
1828___
1829$code.=<<___;
1830        lea             .Lconst(%rip),%rcx
1831        lea             48+64($ctx),$ctx        # size optimization
1832        vmovdqa         96(%rcx),$T0            # .Lpermd_avx2
1833
1834        # expand and copy pre-calculated table to stack
1835        vmovdqu         `16*0-64`($ctx),%x#$T2
1836        and             \$-512,%rsp
1837        vmovdqu         `16*1-64`($ctx),%x#$T3
1838        vmovdqu         `16*2-64`($ctx),%x#$T4
1839        vmovdqu         `16*3-64`($ctx),%x#$D0
1840        vmovdqu         `16*4-64`($ctx),%x#$D1
1841        vmovdqu         `16*5-64`($ctx),%x#$D2
1842        lea             0x90(%rsp),%rax         # size optimization
1843        vmovdqu         `16*6-64`($ctx),%x#$D3
1844        vpermd          $T2,$T0,$T2             # 00003412 -> 14243444
1845        vmovdqu         `16*7-64`($ctx),%x#$D4
1846        vpermd          $T3,$T0,$T3
1847        vmovdqu         `16*8-64`($ctx),%x#$MASK
1848        vpermd          $T4,$T0,$T4
1849        vmovdqa         $T2,0x00(%rsp)
1850        vpermd          $D0,$T0,$D0
1851        vmovdqa         $T3,0x20-0x90(%rax)
1852        vpermd          $D1,$T0,$D1
1853        vmovdqa         $T4,0x40-0x90(%rax)
1854        vpermd          $D2,$T0,$D2
1855        vmovdqa         $D0,0x60-0x90(%rax)
1856        vpermd          $D3,$T0,$D3
1857        vmovdqa         $D1,0x80-0x90(%rax)
1858        vpermd          $D4,$T0,$D4
1859        vmovdqa         $D2,0xa0-0x90(%rax)
1860        vpermd          $MASK,$T0,$MASK
1861        vmovdqa         $D3,0xc0-0x90(%rax)
1862        vmovdqa         $D4,0xe0-0x90(%rax)
1863        vmovdqa         $MASK,0x100-0x90(%rax)
1864        vmovdqa         64(%rcx),$MASK          # .Lmask26
1865
1866        ################################################################
1867        # load input
1868        vmovdqu         16*0($inp),%x#$T0
1869        vmovdqu         16*1($inp),%x#$T1
1870        vinserti128     \$1,16*2($inp),$T0,$T0
1871        vinserti128     \$1,16*3($inp),$T1,$T1
1872        lea             16*4($inp),$inp
1873
1874        vpsrldq         \$6,$T0,$T2             # splat input
1875        vpsrldq         \$6,$T1,$T3
1876        vpunpckhqdq     $T1,$T0,$T4             # 4
1877        vpunpcklqdq     $T3,$T2,$T2             # 2:3
1878        vpunpcklqdq     $T1,$T0,$T0             # 0:1
1879
1880        vpsrlq          \$30,$T2,$T3
1881        vpsrlq          \$4,$T2,$T2
1882        vpsrlq          \$26,$T0,$T1
1883        vpsrlq          \$40,$T4,$T4            # 4
1884        vpand           $MASK,$T2,$T2           # 2
1885        vpand           $MASK,$T0,$T0           # 0
1886        vpand           $MASK,$T1,$T1           # 1
1887        vpand           $MASK,$T3,$T3           # 3
1888        vpor            32(%rcx),$T4,$T4        # padbit, yes, always
1889
1890        vpaddq          $H2,$T2,$H2             # accumulate input
1891        sub             \$64,$len
1892        jz              .Ltail_avx2$suffix
1893        jmp             .Loop_avx2$suffix
1894
1895.align  32
1896.Loop_avx2$suffix:
1897        ################################################################
1898        # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1899        # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1900        # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1901        # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1902        #   \________/\__________/
1903        ################################################################
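            # i.e. four-way parallel Horner evaluation: each 64-bit lane
            # accumulates every fourth block and advances by r^4 per
            # iteration; the final weights r^4..r^1 are applied in
            # .Ltail_avx2 to merge the lanes into the serial result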
1904        #vpaddq         $H2,$T2,$H2             # accumulate input
1905        vpaddq          $H0,$T0,$H0
1906        vmovdqa         `32*0`(%rsp),$T0        # r0^4
1907        vpaddq          $H1,$T1,$H1
1908        vmovdqa         `32*1`(%rsp),$T1        # r1^4
1909        vpaddq          $H3,$T3,$H3
1910        vmovdqa         `32*3`(%rsp),$T2        # r2^4
1911        vpaddq          $H4,$T4,$H4
1912        vmovdqa         `32*6-0x90`(%rax),$T3   # s3^4
1913        vmovdqa         `32*8-0x90`(%rax),$S4   # s4^4
1914
1915        # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1916        # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1917        # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1918        # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1919        # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1920        #
1921        # however, as h2 is "chronologically" the first one available, pull
1922        # the corresponding operations up, so the order becomes
1923        #
1924        # d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
1925        # d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
1926        # d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1927        # d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
1928        # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
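            # (each 5*r term is taken from the precomputed s = 5*r entries of
            # the table, so every product in the formulas above is a single
            # vpmuludq)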
1929
1930        vpmuludq        $H2,$T0,$D2             # d2 = h2*r0
1931        vpmuludq        $H2,$T1,$D3             # d3 = h2*r1
1932        vpmuludq        $H2,$T2,$D4             # d4 = h2*r2
1933        vpmuludq        $H2,$T3,$D0             # d0 = h2*s3
1934        vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
1935
1936        vpmuludq        $H0,$T1,$T4             # h0*r1
1937        vpmuludq        $H1,$T1,$H2             # h1*r1, borrow $H2 as temp
1938        vpaddq          $T4,$D1,$D1             # d1 += h0*r1
1939        vpaddq          $H2,$D2,$D2             # d2 += h1*r1
1940        vpmuludq        $H3,$T1,$T4             # h3*r1
1941        vpmuludq        `32*2`(%rsp),$H4,$H2    # h4*s1
1942        vpaddq          $T4,$D4,$D4             # d4 += h3*r1
1943        vpaddq          $H2,$D0,$D0             # d0 += h4*s1
1944         vmovdqa        `32*4-0x90`(%rax),$T1   # s2
1945
1946        vpmuludq        $H0,$T0,$T4             # h0*r0
1947        vpmuludq        $H1,$T0,$H2             # h1*r0
1948        vpaddq          $T4,$D0,$D0             # d0 += h0*r0
1949        vpaddq          $H2,$D1,$D1             # d1 += h1*r0
1950        vpmuludq        $H3,$T0,$T4             # h3*r0
1951        vpmuludq        $H4,$T0,$H2             # h4*r0
1952         vmovdqu        16*0($inp),%x#$T0       # load input
1953        vpaddq          $T4,$D3,$D3             # d3 += h3*r0
1954        vpaddq          $H2,$D4,$D4             # d4 += h4*r0
1955         vinserti128    \$1,16*2($inp),$T0,$T0
1956
1957        vpmuludq        $H3,$T1,$T4             # h3*s2
1958        vpmuludq        $H4,$T1,$H2             # h4*s2
1959         vmovdqu        16*1($inp),%x#$T1
1960        vpaddq          $T4,$D0,$D0             # d0 += h3*s2
1961        vpaddq          $H2,$D1,$D1             # d1 += h4*s2
1962         vmovdqa        `32*5-0x90`(%rax),$H2   # r3
1963        vpmuludq        $H1,$T2,$T4             # h1*r2
1964        vpmuludq        $H0,$T2,$T2             # h0*r2
1965        vpaddq          $T4,$D3,$D3             # d3 += h1*r2
1966        vpaddq          $T2,$D2,$D2             # d2 += h0*r2
1967         vinserti128    \$1,16*3($inp),$T1,$T1
1968         lea            16*4($inp),$inp
1969
1970        vpmuludq        $H1,$H2,$T4             # h1*r3
1971        vpmuludq        $H0,$H2,$H2             # h0*r3
1972         vpsrldq        \$6,$T0,$T2             # splat input
1973        vpaddq          $T4,$D4,$D4             # d4 += h1*r3
1974        vpaddq          $H2,$D3,$D3             # d3 += h0*r3
1975        vpmuludq        $H3,$T3,$T4             # h3*s3
1976        vpmuludq        $H4,$T3,$H2             # h4*s3
1977         vpsrldq        \$6,$T1,$T3
1978        vpaddq          $T4,$D1,$D1             # d1 += h3*s3
1979        vpaddq          $H2,$D2,$D2             # d2 += h4*s3
1980         vpunpckhqdq    $T1,$T0,$T4             # 4
1981
1982        vpmuludq        $H3,$S4,$H3             # h3*s4
1983        vpmuludq        $H4,$S4,$H4             # h4*s4
1984         vpunpcklqdq    $T1,$T0,$T0             # 0:1
1985        vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
1986        vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
1987         vpunpcklqdq    $T3,$T2,$T3             # 2:3
1988        vpmuludq        `32*7-0x90`(%rax),$H0,$H4       # h0*r4
1989        vpmuludq        $H1,$S4,$H0             # h1*s4
1990        vmovdqa         64(%rcx),$MASK          # .Lmask26
1991        vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
1992        vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
1993
1994        ################################################################
1995        # lazy reduction (interleaved with tail of input splat)
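            # (the extra-indented instructions finish splitting the input
            # loaded above into 26-bit limbs for the next iteration; they are
            # interleaved with the carry propagation to hide latency)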
1996
1997        vpsrlq          \$26,$H3,$D3
1998        vpand           $MASK,$H3,$H3
1999        vpaddq          $D3,$H4,$H4             # h3 -> h4
2000
2001        vpsrlq          \$26,$H0,$D0
2002        vpand           $MASK,$H0,$H0
2003        vpaddq          $D0,$D1,$H1             # h0 -> h1
2004
2005        vpsrlq          \$26,$H4,$D4
2006        vpand           $MASK,$H4,$H4
2007
2008         vpsrlq         \$4,$T3,$T2
2009
2010        vpsrlq          \$26,$H1,$D1
2011        vpand           $MASK,$H1,$H1
2012        vpaddq          $D1,$H2,$H2             # h1 -> h2
2013
2014        vpaddq          $D4,$H0,$H0
2015        vpsllq          \$2,$D4,$D4
2016        vpaddq          $D4,$H0,$H0             # h4 -> h0
2017
2018         vpand          $MASK,$T2,$T2           # 2
2019         vpsrlq         \$26,$T0,$T1
2020
2021        vpsrlq          \$26,$H2,$D2
2022        vpand           $MASK,$H2,$H2
2023        vpaddq          $D2,$H3,$H3             # h2 -> h3
2024
2025         vpaddq         $T2,$H2,$H2             # modulo-scheduled
2026         vpsrlq         \$30,$T3,$T3
2027
2028        vpsrlq          \$26,$H0,$D0
2029        vpand           $MASK,$H0,$H0
2030        vpaddq          $D0,$H1,$H1             # h0 -> h1
2031
2032         vpsrlq         \$40,$T4,$T4            # 4
2033
2034        vpsrlq          \$26,$H3,$D3
2035        vpand           $MASK,$H3,$H3
2036        vpaddq          $D3,$H4,$H4             # h3 -> h4
2037
2038         vpand          $MASK,$T0,$T0           # 0
2039         vpand          $MASK,$T1,$T1           # 1
2040         vpand          $MASK,$T3,$T3           # 3
2041         vpor           32(%rcx),$T4,$T4        # padbit, yes, always
2042
2043        sub             \$64,$len
2044        jnz             .Loop_avx2$suffix
2045
2046        .byte           0x66,0x90
2047.Ltail_avx2$suffix:
2048        ################################################################
2049        # while the multiplications above were by r^4 in all lanes, in the
2050        # last iteration we multiply the least significant lane by r^4 and
2051        # the most significant one by r, so this is a copy of the above,
2052        # except that references to the precomputed table are displaced by 4...
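            # (with the displaced loads the low dword of each 64-bit lane
            # picks up r^4, r^3, r^2 and r^1 respectively, instead of r^4 in
            # every lane, applying the final decreasing powers to the four
            # accumulators)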
2053
2054        #vpaddq         $H2,$T2,$H2             # accumulate input
2055        vpaddq          $H0,$T0,$H0
2056        vmovdqu         `32*0+4`(%rsp),$T0      # r0^4
2057        vpaddq          $H1,$T1,$H1
2058        vmovdqu         `32*1+4`(%rsp),$T1      # r1^4
2059        vpaddq          $H3,$T3,$H3
2060        vmovdqu         `32*3+4`(%rsp),$T2      # r2^4
2061        vpaddq          $H4,$T4,$H4
2062        vmovdqu         `32*6+4-0x90`(%rax),$T3 # s3^4
2063        vmovdqu         `32*8+4-0x90`(%rax),$S4 # s4^4
2064
2065        vpmuludq        $H2,$T0,$D2             # d2 = h2*r0
2066        vpmuludq        $H2,$T1,$D3             # d3 = h2*r1
2067        vpmuludq        $H2,$T2,$D4             # d4 = h2*r2
2068        vpmuludq        $H2,$T3,$D0             # d0 = h2*s3
2069        vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
2070
2071        vpmuludq        $H0,$T1,$T4             # h0*r1
2072        vpmuludq        $H1,$T1,$H2             # h1*r1
2073        vpaddq          $T4,$D1,$D1             # d1 += h0*r1
2074        vpaddq          $H2,$D2,$D2             # d2 += h1*r1
2075        vpmuludq        $H3,$T1,$T4             # h3*r1
2076        vpmuludq        `32*2+4`(%rsp),$H4,$H2  # h4*s1
2077        vpaddq          $T4,$D4,$D4             # d4 += h3*r1
2078        vpaddq          $H2,$D0,$D0             # d0 += h4*s1
2079
2080        vpmuludq        $H0,$T0,$T4             # h0*r0
2081        vpmuludq        $H1,$T0,$H2             # h1*r0
2082        vpaddq          $T4,$D0,$D0             # d0 += h0*r0
2083         vmovdqu        `32*4+4-0x90`(%rax),$T1 # s2
2084        vpaddq          $H2,$D1,$D1             # d1 += h1*r0
2085        vpmuludq        $H3,$T0,$T4             # h3*r0
2086        vpmuludq        $H4,$T0,$H2             # h4*r0
2087        vpaddq          $T4,$D3,$D3             # d3 += h3*r0
2088        vpaddq          $H2,$D4,$D4             # d4 += h4*r0
2089
2090        vpmuludq        $H3,$T1,$T4             # h3*s2
2091        vpmuludq        $H4,$T1,$H2             # h4*s2
2092        vpaddq          $T4,$D0,$D0             # d0 += h3*s2
2093        vpaddq          $H2,$D1,$D1             # d1 += h4*s2
2094         vmovdqu        `32*5+4-0x90`(%rax),$H2 # r3
2095        vpmuludq        $H1,$T2,$T4             # h1*r2
2096        vpmuludq        $H0,$T2,$T2             # h0*r2
2097        vpaddq          $T4,$D3,$D3             # d3 += h1*r2
2098        vpaddq          $T2,$D2,$D2             # d2 += h0*r2
2099
2100        vpmuludq        $H1,$H2,$T4             # h1*r3
2101        vpmuludq        $H0,$H2,$H2             # h0*r3
2102        vpaddq          $T4,$D4,$D4             # d4 += h1*r3
2103        vpaddq          $H2,$D3,$D3             # d3 += h0*r3
2104        vpmuludq        $H3,$T3,$T4             # h3*s3
2105        vpmuludq        $H4,$T3,$H2             # h4*s3
2106        vpaddq          $T4,$D1,$D1             # d1 += h3*s3
2107        vpaddq          $H2,$D2,$D2             # d2 += h4*s3
2108
2109        vpmuludq        $H3,$S4,$H3             # h3*s4
2110        vpmuludq        $H4,$S4,$H4             # h4*s4
2111        vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
2112        vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
2113        vpmuludq        `32*7+4-0x90`(%rax),$H0,$H4             # h0*r4
2114        vpmuludq        $H1,$S4,$H0             # h1*s4
2115        vmovdqa         64(%rcx),$MASK          # .Lmask26
2116        vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
2117        vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
2118
2119        ################################################################
2120        # horizontal addition
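            # (each ymm register holds four partial sums of one limb: vpsrldq
            # folds the two qwords within each 128-bit half, then vpermq 0x2
            # folds the upper half onto the lower one, leaving the total in
            # the low qword)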
2121
2122        vpsrldq         \$8,$D1,$T1
2123        vpsrldq         \$8,$H2,$T2
2124        vpsrldq         \$8,$H3,$T3
2125        vpsrldq         \$8,$H4,$T4
2126        vpsrldq         \$8,$H0,$T0
2127        vpaddq          $T1,$D1,$D1
2128        vpaddq          $T2,$H2,$H2
2129        vpaddq          $T3,$H3,$H3
2130        vpaddq          $T4,$H4,$H4
2131        vpaddq          $T0,$H0,$H0
2132
2133        vpermq          \$0x2,$H3,$T3
2134        vpermq          \$0x2,$H4,$T4
2135        vpermq          \$0x2,$H0,$T0
2136        vpermq          \$0x2,$D1,$T1
2137        vpermq          \$0x2,$H2,$T2
2138        vpaddq          $T3,$H3,$H3
2139        vpaddq          $T4,$H4,$H4
2140        vpaddq          $T0,$H0,$H0
2141        vpaddq          $T1,$D1,$D1
2142        vpaddq          $T2,$H2,$H2
2143
2144        ################################################################
2145        # lazy reduction
2146
2147        vpsrlq          \$26,$H3,$D3
2148        vpand           $MASK,$H3,$H3
2149        vpaddq          $D3,$H4,$H4             # h3 -> h4
2150
2151        vpsrlq          \$26,$H0,$D0
2152        vpand           $MASK,$H0,$H0
2153        vpaddq          $D0,$D1,$H1             # h0 -> h1
2154
2155        vpsrlq          \$26,$H4,$D4
2156        vpand           $MASK,$H4,$H4
2157
2158        vpsrlq          \$26,$H1,$D1
2159        vpand           $MASK,$H1,$H1
2160        vpaddq          $D1,$H2,$H2             # h1 -> h2
2161
2162        vpaddq          $D4,$H0,$H0
2163        vpsllq          \$2,$D4,$D4
2164        vpaddq          $D4,$H0,$H0             # h4 -> h0
2165
2166        vpsrlq          \$26,$H2,$D2
2167        vpand           $MASK,$H2,$H2
2168        vpaddq          $D2,$H3,$H3             # h2 -> h3
2169
2170        vpsrlq          \$26,$H0,$D0
2171        vpand           $MASK,$H0,$H0
2172        vpaddq          $D0,$H1,$H1             # h0 -> h1
2173
2174        vpsrlq          \$26,$H3,$D3
2175        vpand           $MASK,$H3,$H3
2176        vpaddq          $D3,$H4,$H4             # h3 -> h4
2177
2178        vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2179        vmovd           %x#$H1,`4*1-48-64`($ctx)
2180        vmovd           %x#$H2,`4*2-48-64`($ctx)
2181        vmovd           %x#$H3,`4*3-48-64`($ctx)
2182        vmovd           %x#$H4,`4*4-48-64`($ctx)
2183___
2184$code.=<<___    if ($win64);
2185        vmovdqa         -0xb0(%r10),%xmm6
2186        vmovdqa         -0xa0(%r10),%xmm7
2187        vmovdqa         -0x90(%r10),%xmm8
2188        vmovdqa         -0x80(%r10),%xmm9
2189        vmovdqa         -0x70(%r10),%xmm10
2190        vmovdqa         -0x60(%r10),%xmm11
2191        vmovdqa         -0x50(%r10),%xmm12
2192        vmovdqa         -0x40(%r10),%xmm13
2193        vmovdqa         -0x30(%r10),%xmm14
2194        vmovdqa         -0x20(%r10),%xmm15
2195        lea             -8(%r10),%rsp
2196.Ldo_avx2_epilogue$suffix:
2197___
2198$code.=<<___    if (!$win64);
2199        lea             -8(%r10),%rsp
2200.cfi_def_cfa_register   %rsp
2201___
2202$code.=<<___;
2203        vzeroupper
2204        ret
2205.cfi_endproc
2206___
2207if($avx > 2 && $avx512) {
2208my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2209my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2210my $PADBIT="%zmm30";
2211
2212map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));            # switch to %zmm domain
2213map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2214map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2215map(s/%y/%z/,($MASK));
2216
2217$code.=<<___;
2218.cfi_startproc
2219.Lblocks_avx512:
2220        mov             \$15,%eax
2221        kmovw           %eax,%k2
2222___
2223$code.=<<___    if (!$win64);
2224        lea             8(%rsp),%r10
2225.cfi_def_cfa_register   %r10
2226        sub             \$0x128,%rsp
2227___
2228$code.=<<___    if ($win64);
2229        lea             8(%rsp),%r10
2230        sub             \$0x1c8,%rsp
2231        vmovdqa         %xmm6,-0xb0(%r10)
2232        vmovdqa         %xmm7,-0xa0(%r10)
2233        vmovdqa         %xmm8,-0x90(%r10)
2234        vmovdqa         %xmm9,-0x80(%r10)
2235        vmovdqa         %xmm10,-0x70(%r10)
2236        vmovdqa         %xmm11,-0x60(%r10)
2237        vmovdqa         %xmm12,-0x50(%r10)
2238        vmovdqa         %xmm13,-0x40(%r10)
2239        vmovdqa         %xmm14,-0x30(%r10)
2240        vmovdqa         %xmm15,-0x20(%r10)
2241.Ldo_avx512_body:
2242___
2243$code.=<<___;
2244        lea             .Lconst(%rip),%rcx
2245        lea             48+64($ctx),$ctx        # size optimization
2246        vmovdqa         96(%rcx),%y#$T2         # .Lpermd_avx2
2247
2248        # expand pre-calculated table
2249        vmovdqu         `16*0-64`($ctx),%x#$D0  # will become expanded ${R0}
2250        and             \$-512,%rsp
2251        vmovdqu         `16*1-64`($ctx),%x#$D1  # will become ... ${R1}
2252        mov             \$0x20,%rax
2253        vmovdqu         `16*2-64`($ctx),%x#$T0  # ... ${S1}
2254        vmovdqu         `16*3-64`($ctx),%x#$D2  # ... ${R2}
2255        vmovdqu         `16*4-64`($ctx),%x#$T1  # ... ${S2}
2256        vmovdqu         `16*5-64`($ctx),%x#$D3  # ... ${R3}
2257        vmovdqu         `16*6-64`($ctx),%x#$T3  # ... ${S3}
2258        vmovdqu         `16*7-64`($ctx),%x#$D4  # ... ${R4}
2259        vmovdqu         `16*8-64`($ctx),%x#$T4  # ... ${S4}
2260        vpermd          $D0,$T2,$R0             # 00003412 -> 14243444
2261        vpbroadcastq    64(%rcx),$MASK          # .Lmask26
2262        vpermd          $D1,$T2,$R1
2263        vpermd          $T0,$T2,$S1
2264        vpermd          $D2,$T2,$R2
2265        vmovdqa64       $R0,0x00(%rsp){%k2}     # save in case $len%128 != 0
2266         vpsrlq         \$32,$R0,$T0            # 14243444 -> 01020304
2267        vpermd          $T1,$T2,$S2
2268        vmovdqu64       $R1,0x00(%rsp,%rax){%k2}
2269         vpsrlq         \$32,$R1,$T1
2270        vpermd          $D3,$T2,$R3
2271        vmovdqa64       $S1,0x40(%rsp){%k2}
2272        vpermd          $T3,$T2,$S3
2273        vpermd          $D4,$T2,$R4
2274        vmovdqu64       $R2,0x40(%rsp,%rax){%k2}
2275        vpermd          $T4,$T2,$S4
2276        vmovdqa64       $S2,0x80(%rsp){%k2}
2277        vmovdqu64       $R3,0x80(%rsp,%rax){%k2}
2278        vmovdqa64       $S3,0xc0(%rsp){%k2}
2279        vmovdqu64       $R4,0xc0(%rsp,%rax){%k2}
2280        vmovdqa64       $S4,0x100(%rsp){%k2}
2281
2282        ################################################################
2283        # calculate 5th through 8th powers of the key
2284        #
2285        # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2286        # d1 = r0'*r1 + r1'*r0   + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2287        # d2 = r0'*r2 + r1'*r1   + r2'*r0   + r3'*5*r4 + r4'*5*r3
2288        # d3 = r0'*r3 + r1'*r2   + r2'*r1   + r3'*r0   + r4'*5*r4
2289        # d4 = r0'*r4 + r1'*r3   + r2'*r2   + r3'*r1   + r4'*r0
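            # (per 64-bit lane the primed operands in T0-T4 carry the limbs of
            # r^1..r^4 while the low dwords of the table registers carry r^4,
            # so d0-d4 below come out as the limbs of r^5..r^8, one power per
            # lane)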
2290
2291        vpmuludq        $T0,$R0,$D0             # d0 = r0'*r0
2292        vpmuludq        $T0,$R1,$D1             # d1 = r0'*r1
2293        vpmuludq        $T0,$R2,$D2             # d2 = r0'*r2
2294        vpmuludq        $T0,$R3,$D3             # d3 = r0'*r3
2295        vpmuludq        $T0,$R4,$D4             # d4 = r0'*r4
2296         vpsrlq         \$32,$R2,$T2
2297
2298        vpmuludq        $T1,$S4,$M0
2299        vpmuludq        $T1,$R0,$M1
2300        vpmuludq        $T1,$R1,$M2
2301        vpmuludq        $T1,$R2,$M3
2302        vpmuludq        $T1,$R3,$M4
2303         vpsrlq         \$32,$R3,$T3
2304        vpaddq          $M0,$D0,$D0             # d0 += r1'*5*r4
2305        vpaddq          $M1,$D1,$D1             # d1 += r1'*r0
2306        vpaddq          $M2,$D2,$D2             # d2 += r1'*r1
2307        vpaddq          $M3,$D3,$D3             # d3 += r1'*r2
2308        vpaddq          $M4,$D4,$D4             # d4 += r1'*r3
2309
2310        vpmuludq        $T2,$S3,$M0
2311        vpmuludq        $T2,$S4,$M1
2312        vpmuludq        $T2,$R1,$M3
2313        vpmuludq        $T2,$R2,$M4
2314        vpmuludq        $T2,$R0,$M2
2315         vpsrlq         \$32,$R4,$T4
2316        vpaddq          $M0,$D0,$D0             # d0 += r2'*5*r3
2317        vpaddq          $M1,$D1,$D1             # d1 += r2'*5*r4
2318        vpaddq          $M3,$D3,$D3             # d3 += r2'*r1
2319        vpaddq          $M4,$D4,$D4             # d4 += r2'*r2
2320        vpaddq          $M2,$D2,$D2             # d2 += r2'*r0
2321
2322        vpmuludq        $T3,$S2,$M0
2323        vpmuludq        $T3,$R0,$M3
2324        vpmuludq        $T3,$R1,$M4
2325        vpmuludq        $T3,$S3,$M1
2326        vpmuludq        $T3,$S4,$M2
2327        vpaddq          $M0,$D0,$D0             # d0 += r3'*5*r2
2328        vpaddq          $M3,$D3,$D3             # d3 += r3'*r0
2329        vpaddq          $M4,$D4,$D4             # d4 += r3'*r1
2330        vpaddq          $M1,$D1,$D1             # d1 += r3'*5*r3
2331        vpaddq          $M2,$D2,$D2             # d2 += r3'*5*r4
2332
2333        vpmuludq        $T4,$S4,$M3
2334        vpmuludq        $T4,$R0,$M4
2335        vpmuludq        $T4,$S1,$M0
2336        vpmuludq        $T4,$S2,$M1
2337        vpmuludq        $T4,$S3,$M2
2338        vpaddq          $M3,$D3,$D3             # d3 += r4'*5*r4
2339        vpaddq          $M4,$D4,$D4             # d4 += r4'*r0
2340        vpaddq          $M0,$D0,$D0             # d0 += r4'*5*r1
2341        vpaddq          $M1,$D1,$D1             # d1 += r4'*5*r2
2342        vpaddq          $M2,$D2,$D2             # d2 += r4'*5*r3
2343
2344        ################################################################
2345        # load input
2346        vmovdqu64       16*0($inp),%z#$T3
2347        vmovdqu64       16*4($inp),%z#$T4
2348        lea             16*8($inp),$inp
2349
2350        ################################################################
2351        # lazy reduction
2352
2353        vpsrlq          \$26,$D3,$M3
2354        vpandq          $MASK,$D3,$D3
2355        vpaddq          $M3,$D4,$D4             # d3 -> d4
2356
2357        vpsrlq          \$26,$D0,$M0
2358        vpandq          $MASK,$D0,$D0
2359        vpaddq          $M0,$D1,$D1             # d0 -> d1
2360
2361        vpsrlq          \$26,$D4,$M4
2362        vpandq          $MASK,$D4,$D4
2363
2364        vpsrlq          \$26,$D1,$M1
2365        vpandq          $MASK,$D1,$D1
2366        vpaddq          $M1,$D2,$D2             # d1 -> d2
2367
2368        vpaddq          $M4,$D0,$D0
2369        vpsllq          \$2,$M4,$M4
2370        vpaddq          $M4,$D0,$D0             # d4 -> d0
2371
2372        vpsrlq          \$26,$D2,$M2
2373        vpandq          $MASK,$D2,$D2
2374        vpaddq          $M2,$D3,$D3             # d2 -> d3
2375
2376        vpsrlq          \$26,$D0,$M0
2377        vpandq          $MASK,$D0,$D0
2378        vpaddq          $M0,$D1,$D1             # d0 -> d1
2379
2380        vpsrlq          \$26,$D3,$M3
2381        vpandq          $MASK,$D3,$D3
2382        vpaddq          $M3,$D4,$D4             # d3 -> d4
2383
2384        ################################################################
2385        # at this point we have 14243444 in $R0-$S4 and 05060708 in
2386        # $D0-$D4, ...
2387
2388        vpunpcklqdq     $T4,$T3,$T0     # transpose input
2389        vpunpckhqdq     $T4,$T3,$T4
2390
2391        # ... since input 64-bit lanes are ordered as 73625140, we could
2392        # "vperm" it to 76543210 (here and in each loop iteration), *or*
2393        # we could just flow along, hence the goal for $R0-$S4 is
2394        # 1858286838784888 ...
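            # (that is, each lane of the key vectors is arranged so the
            # message word already sitting in the matching input lane gets
            # multiplied by its proper power of r, saving a permute of the
            # input in every iteration)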
2395
2396        vmovdqa32       128(%rcx),$M0           # .Lpermd_avx512:
2397        mov             \$0x7777,%eax
2398        kmovw           %eax,%k1
2399
2400        vpermd          $R0,$M0,$R0             # 14243444 -> 1---2---3---4---
2401        vpermd          $R1,$M0,$R1
2402        vpermd          $R2,$M0,$R2
2403        vpermd          $R3,$M0,$R3
2404        vpermd          $R4,$M0,$R4
2405
2406        vpermd          $D0,$M0,${R0}{%k1}      # 05060708 -> 1858286838784888
2407        vpermd          $D1,$M0,${R1}{%k1}
2408        vpermd          $D2,$M0,${R2}{%k1}
2409        vpermd          $D3,$M0,${R3}{%k1}
2410        vpermd          $D4,$M0,${R4}{%k1}
2411
2412        vpslld          \$2,$R1,$S1             # *5
2413        vpslld          \$2,$R2,$S2
2414        vpslld          \$2,$R3,$S3
2415        vpslld          \$2,$R4,$S4
2416        vpaddd          $R1,$S1,$S1
2417        vpaddd          $R2,$S2,$S2
2418        vpaddd          $R3,$S3,$S3
2419        vpaddd          $R4,$S4,$S4
2420
2421        vpbroadcastq    32(%rcx),$PADBIT        # .L129
2422
2423        vpsrlq          \$52,$T0,$T2            # splat input
2424        vpsllq          \$12,$T4,$T3
2425        vporq           $T3,$T2,$T2
2426        vpsrlq          \$26,$T0,$T1
2427        vpsrlq          \$14,$T4,$T3
2428        vpsrlq          \$40,$T4,$T4            # 4
2429        vpandq          $MASK,$T2,$T2           # 2
2430        vpandq          $MASK,$T0,$T0           # 0
2431        #vpandq         $MASK,$T1,$T1           # 1
2432        #vpandq         $MASK,$T3,$T3           # 3
2433        #vporq          $PADBIT,$T4,$T4         # padbit, yes, always
2434
2435        vpaddq          $H2,$T2,$H2             # accumulate input
2436        sub             \$192,$len
2437        jbe             .Ltail_avx512
2438        jmp             .Loop_avx512
2439
2440.align  32
2441.Loop_avx512:
2442        ################################################################
2443        # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2444        # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2445        # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2446        # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2447        # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2448        # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2449        # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2450        # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2451        #   \________/\___________/
2452        ################################################################
2453        #vpaddq         $H2,$T2,$H2             # accumulate input
2454
2455        # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
2456        # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
2457        # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
2458        # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
2459        # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2460        #
2461        # however, as h2 is "chronologically" the first one available, pull
2462        # the corresponding operations up, so the order becomes
2463        #
2464        # d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4
2465        # d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0
2466        # d0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1
2467        # d1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2
2468        # d2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
2469
2470        vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
2471         vpaddq         $H0,$T0,$H0
2472        vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
2473         vpandq         $MASK,$T1,$T1           # 1
2474        vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
2475         vpandq         $MASK,$T3,$T3           # 3
2476        vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
2477         vporq          $PADBIT,$T4,$T4         # padbit, yes, always
2478        vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
2479         vpaddq         $H1,$T1,$H1             # accumulate input
2480         vpaddq         $H3,$T3,$H3
2481         vpaddq         $H4,$T4,$H4
2482
2483          vmovdqu64     16*0($inp),$T3          # load input
2484          vmovdqu64     16*4($inp),$T4
2485          lea           16*8($inp),$inp
2486        vpmuludq        $H0,$R3,$M3
2487        vpmuludq        $H0,$R4,$M4
2488        vpmuludq        $H0,$R0,$M0
2489        vpmuludq        $H0,$R1,$M1
2490        vpaddq          $M3,$D3,$D3             # d3 += h0*r3
2491        vpaddq          $M4,$D4,$D4             # d4 += h0*r4
2492        vpaddq          $M0,$D0,$D0             # d0 += h0*r0
2493        vpaddq          $M1,$D1,$D1             # d1 += h0*r1
2494
2495        vpmuludq        $H1,$R2,$M3
2496        vpmuludq        $H1,$R3,$M4
2497        vpmuludq        $H1,$S4,$M0
2498        vpmuludq        $H0,$R2,$M2
2499        vpaddq          $M3,$D3,$D3             # d3 += h1*r2
2500        vpaddq          $M4,$D4,$D4             # d4 += h1*r3
2501        vpaddq          $M0,$D0,$D0             # d0 += h1*s4
2502        vpaddq          $M2,$D2,$D2             # d2 += h0*r2
2503
2504          vpunpcklqdq   $T4,$T3,$T0             # transpose input
2505          vpunpckhqdq   $T4,$T3,$T4
2506
2507        vpmuludq        $H3,$R0,$M3
2508        vpmuludq        $H3,$R1,$M4
2509        vpmuludq        $H1,$R0,$M1
2510        vpmuludq        $H1,$R1,$M2
2511        vpaddq          $M3,$D3,$D3             # d3 += h3*r0
2512        vpaddq          $M4,$D4,$D4             # d4 += h3*r1
2513        vpaddq          $M1,$D1,$D1             # d1 += h1*r0
2514        vpaddq          $M2,$D2,$D2             # d2 += h1*r1
2515
2516        vpmuludq        $H4,$S4,$M3
2517        vpmuludq        $H4,$R0,$M4
2518        vpmuludq        $H3,$S2,$M0
2519        vpmuludq        $H3,$S3,$M1
2520        vpaddq          $M3,$D3,$D3             # d3 += h4*s4
2521        vpmuludq        $H3,$S4,$M2
2522        vpaddq          $M4,$D4,$D4             # d4 += h4*r0
2523        vpaddq          $M0,$D0,$D0             # d0 += h3*s2
2524        vpaddq          $M1,$D1,$D1             # d1 += h3*s3
2525        vpaddq          $M2,$D2,$D2             # d2 += h3*s4
2526
2527        vpmuludq        $H4,$S1,$M0
2528        vpmuludq        $H4,$S2,$M1
2529        vpmuludq        $H4,$S3,$M2
2530        vpaddq          $M0,$D0,$H0             # h0 = d0 + h4*s1
2531        vpaddq          $M1,$D1,$H1             # h1 = d1 + h4*s2
2532        vpaddq          $M2,$D2,$H2             # h2 = d2 + h4*s3
2533
2534        ################################################################
2535        # lazy reduction (interleaved with input splat)
2536
2537         vpsrlq         \$52,$T0,$T2            # splat input
2538         vpsllq         \$12,$T4,$T3
2539
2540        vpsrlq          \$26,$D3,$H3
2541        vpandq          $MASK,$D3,$D3
2542        vpaddq          $H3,$D4,$H4             # h3 -> h4
2543
2544         vporq          $T3,$T2,$T2
2545
2546        vpsrlq          \$26,$H0,$D0
2547        vpandq          $MASK,$H0,$H0
2548        vpaddq          $D0,$H1,$H1             # h0 -> h1
2549
2550         vpandq         $MASK,$T2,$T2           # 2
2551
2552        vpsrlq          \$26,$H4,$D4
2553        vpandq          $MASK,$H4,$H4
2554
2555        vpsrlq          \$26,$H1,$D1
2556        vpandq          $MASK,$H1,$H1
2557        vpaddq          $D1,$H2,$H2             # h1 -> h2
2558
2559        vpaddq          $D4,$H0,$H0
2560        vpsllq          \$2,$D4,$D4
2561        vpaddq          $D4,$H0,$H0             # h4 -> h0
2562
2563         vpaddq         $T2,$H2,$H2             # modulo-scheduled
2564         vpsrlq         \$26,$T0,$T1
2565
2566        vpsrlq          \$26,$H2,$D2
2567        vpandq          $MASK,$H2,$H2
2568        vpaddq          $D2,$D3,$H3             # h2 -> h3
2569
2570         vpsrlq         \$14,$T4,$T3
2571
2572        vpsrlq          \$26,$H0,$D0
2573        vpandq          $MASK,$H0,$H0
2574        vpaddq          $D0,$H1,$H1             # h0 -> h1
2575
2576         vpsrlq         \$40,$T4,$T4            # 4
2577
2578        vpsrlq          \$26,$H3,$D3
2579        vpandq          $MASK,$H3,$H3
2580        vpaddq          $D3,$H4,$H4             # h3 -> h4
2581
2582         vpandq         $MASK,$T0,$T0           # 0
2583         #vpandq        $MASK,$T1,$T1           # 1
2584         #vpandq        $MASK,$T3,$T3           # 3
2585         #vporq         $PADBIT,$T4,$T4         # padbit, yes, always
2586
2587        sub             \$128,$len
2588        ja              .Loop_avx512
2589
2590.Ltail_avx512:
2591        ################################################################
2592        # while the multiplications above were by r^8 in all lanes, in the
2593        # last iteration we multiply the least significant lane by r^8 and
2594        # the most significant one by r, which is why the table gets shifted...
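            # (vpsrlq by 32 moves the per-lane "other" power into the low
            # dword, so each lane is now multiplied by its final power of r,
            # matched to the 73625140 input ordering)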
2595
2596        vpsrlq          \$32,$R0,$R0            # 0105020603070408
2597        vpsrlq          \$32,$R1,$R1
2598        vpsrlq          \$32,$R2,$R2
2599        vpsrlq          \$32,$S3,$S3
2600        vpsrlq          \$32,$S4,$S4
2601        vpsrlq          \$32,$R3,$R3
2602        vpsrlq          \$32,$R4,$R4
2603        vpsrlq          \$32,$S1,$S1
2604        vpsrlq          \$32,$S2,$S2
2605
2606        ################################################################
2607        # load either the next or the last 64 bytes of input
2608        lea             ($inp,$len),$inp
2609
2610        #vpaddq         $H2,$T2,$H2             # accumulate input
2611        vpaddq          $H0,$T0,$H0
2612
2613        vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
2614        vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
2615        vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
2616         vpandq         $MASK,$T1,$T1           # 1
2617        vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
2618         vpandq         $MASK,$T3,$T3           # 3
2619        vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
2620         vporq          $PADBIT,$T4,$T4         # padbit, yes, always
2621         vpaddq         $H1,$T1,$H1             # accumulate input
2622         vpaddq         $H3,$T3,$H3
2623         vpaddq         $H4,$T4,$H4
2624
2625          vmovdqu       16*0($inp),%x#$T0
2626        vpmuludq        $H0,$R3,$M3
2627        vpmuludq        $H0,$R4,$M4
2628        vpmuludq        $H0,$R0,$M0
2629        vpmuludq        $H0,$R1,$M1
2630        vpaddq          $M3,$D3,$D3             # d3 += h0*r3
2631        vpaddq          $M4,$D4,$D4             # d4 += h0*r4
2632        vpaddq          $M0,$D0,$D0             # d0 += h0*r0
2633        vpaddq          $M1,$D1,$D1             # d1 += h0*r1
2634
2635          vmovdqu       16*1($inp),%x#$T1
2636        vpmuludq        $H1,$R2,$M3
2637        vpmuludq        $H1,$R3,$M4
2638        vpmuludq        $H1,$S4,$M0
2639        vpmuludq        $H0,$R2,$M2
2640        vpaddq          $M3,$D3,$D3             # d3 += h1*r2
2641        vpaddq          $M4,$D4,$D4             # d4 += h1*r3
2642        vpaddq          $M0,$D0,$D0             # d0 += h1*s4
2643        vpaddq          $M2,$D2,$D2             # d2 += h0*r2
2644
2645          vinserti128   \$1,16*2($inp),%y#$T0,%y#$T0
2646        vpmuludq        $H3,$R0,$M3
2647        vpmuludq        $H3,$R1,$M4
2648        vpmuludq        $H1,$R0,$M1
2649        vpmuludq        $H1,$R1,$M2
2650        vpaddq          $M3,$D3,$D3             # d3 += h3*r0
2651        vpaddq          $M4,$D4,$D4             # d4 += h3*r1
2652        vpaddq          $M1,$D1,$D1             # d1 += h1*r0
2653        vpaddq          $M2,$D2,$D2             # d2 += h1*r1
2654
2655          vinserti128   \$1,16*3($inp),%y#$T1,%y#$T1
2656        vpmuludq        $H4,$S4,$M3
2657        vpmuludq        $H4,$R0,$M4
2658        vpmuludq        $H3,$S2,$M0
2659        vpmuludq        $H3,$S3,$M1
2660        vpmuludq        $H3,$S4,$M2
2661        vpaddq          $M3,$D3,$H3             # h3 = d3 + h4*s4
2662        vpaddq          $M4,$D4,$D4             # d4 += h4*r0
2663        vpaddq          $M0,$D0,$D0             # d0 += h3*s2
2664        vpaddq          $M1,$D1,$D1             # d1 += h3*s3
2665        vpaddq          $M2,$D2,$D2             # d2 += h3*s4
2666
2667        vpmuludq        $H4,$S1,$M0
2668        vpmuludq        $H4,$S2,$M1
2669        vpmuludq        $H4,$S3,$M2
2670        vpaddq          $M0,$D0,$H0             # h0 = d0 + h4*s1
2671        vpaddq          $M1,$D1,$H1             # h1 = d1 + h4*s2
2672        vpaddq          $M2,$D2,$H2             # h2 = d2 + h4*s3
2673
2674        ################################################################
2675        # horizontal addition
2676
2677        mov             \$1,%eax
2678        vpermq          \$0xb1,$H3,$D3
2679        vpermq          \$0xb1,$D4,$H4
2680        vpermq          \$0xb1,$H0,$D0
2681        vpermq          \$0xb1,$H1,$D1
2682        vpermq          \$0xb1,$H2,$D2
2683        vpaddq          $D3,$H3,$H3
2684        vpaddq          $D4,$H4,$H4
2685        vpaddq          $D0,$H0,$H0
2686        vpaddq          $D1,$H1,$H1
2687        vpaddq          $D2,$H2,$H2
2688
2689        kmovw           %eax,%k3
2690        vpermq          \$0x2,$H3,$D3
2691        vpermq          \$0x2,$H4,$D4
2692        vpermq          \$0x2,$H0,$D0
2693        vpermq          \$0x2,$H1,$D1
2694        vpermq          \$0x2,$H2,$D2
2695        vpaddq          $D3,$H3,$H3
2696        vpaddq          $D4,$H4,$H4
2697        vpaddq          $D0,$H0,$H0
2698        vpaddq          $D1,$H1,$H1
2699        vpaddq          $D2,$H2,$H2
2700
2701        vextracti64x4   \$0x1,$H3,%y#$D3
2702        vextracti64x4   \$0x1,$H4,%y#$D4
2703        vextracti64x4   \$0x1,$H0,%y#$D0
2704        vextracti64x4   \$0x1,$H1,%y#$D1
2705        vextracti64x4   \$0x1,$H2,%y#$D2
2706        vpaddq          $D3,$H3,${H3}{%k3}{z}   # keep single qword in case
2707        vpaddq          $D4,$H4,${H4}{%k3}{z}   # it's passed to .Ltail_avx2
2708        vpaddq          $D0,$H0,${H0}{%k3}{z}
2709        vpaddq          $D1,$H1,${H1}{%k3}{z}
2710        vpaddq          $D2,$H2,${H2}{%k3}{z}
2711___
2712map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2713map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2714$code.=<<___;
2715        ################################################################
2716        # lazy reduction (interleaved with input splat)
2717
2718        vpsrlq          \$26,$H3,$D3
2719        vpand           $MASK,$H3,$H3
2720         vpsrldq        \$6,$T0,$T2             # splat input
2721         vpsrldq        \$6,$T1,$T3
2722         vpunpckhqdq    $T1,$T0,$T4             # 4
2723        vpaddq          $D3,$H4,$H4             # h3 -> h4
2724
2725        vpsrlq          \$26,$H0,$D0
2726        vpand           $MASK,$H0,$H0
2727         vpunpcklqdq    $T3,$T2,$T2             # 2:3
2728         vpunpcklqdq    $T1,$T0,$T0             # 0:1
2729        vpaddq          $D0,$H1,$H1             # h0 -> h1
2730
2731        vpsrlq          \$26,$H4,$D4
2732        vpand           $MASK,$H4,$H4
2733
2734        vpsrlq          \$26,$H1,$D1
2735        vpand           $MASK,$H1,$H1
2736         vpsrlq         \$30,$T2,$T3
2737         vpsrlq         \$4,$T2,$T2
2738        vpaddq          $D1,$H2,$H2             # h1 -> h2
2739
2740        vpaddq          $D4,$H0,$H0
2741        vpsllq          \$2,$D4,$D4
2742         vpsrlq         \$26,$T0,$T1
2743         vpsrlq         \$40,$T4,$T4            # 4
2744        vpaddq          $D4,$H0,$H0             # h4 -> h0
2745
2746        vpsrlq          \$26,$H2,$D2
2747        vpand           $MASK,$H2,$H2
2748         vpand          $MASK,$T2,$T2           # 2
2749         vpand          $MASK,$T0,$T0           # 0
2750        vpaddq          $D2,$H3,$H3             # h2 -> h3
2751
2752        vpsrlq          \$26,$H0,$D0
2753        vpand           $MASK,$H0,$H0
2754         vpaddq         $H2,$T2,$H2             # accumulate input for .Ltail_avx2
2755         vpand          $MASK,$T1,$T1           # 1
2756        vpaddq          $D0,$H1,$H1             # h0 -> h1
2757
2758        vpsrlq          \$26,$H3,$D3
2759        vpand           $MASK,$H3,$H3
2760         vpand          $MASK,$T3,$T3           # 3
2761         vpor           32(%rcx),$T4,$T4        # padbit, yes, always
2762        vpaddq          $D3,$H4,$H4             # h3 -> h4
2763
2764        lea             0x90(%rsp),%rax         # size optimization for .Ltail_avx2
2765        add             \$64,$len
2766        jnz             .Ltail_avx2$suffix
2767
2768        vpsubq          $T2,$H2,$H2             # undo input accumulation
2769        vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2770        vmovd           %x#$H1,`4*1-48-64`($ctx)
2771        vmovd           %x#$H2,`4*2-48-64`($ctx)
2772        vmovd           %x#$H3,`4*3-48-64`($ctx)
2773        vmovd           %x#$H4,`4*4-48-64`($ctx)
2774        vzeroall
2775___
2776$code.=<<___    if ($win64);
2777        movdqa          -0xb0(%r10),%xmm6
2778        movdqa          -0xa0(%r10),%xmm7
2779        movdqa          -0x90(%r10),%xmm8
2780        movdqa          -0x80(%r10),%xmm9
2781        movdqa          -0x70(%r10),%xmm10
2782        movdqa          -0x60(%r10),%xmm11
2783        movdqa          -0x50(%r10),%xmm12
2784        movdqa          -0x40(%r10),%xmm13
2785        movdqa          -0x30(%r10),%xmm14
2786        movdqa          -0x20(%r10),%xmm15
2787        lea             -8(%r10),%rsp
2788.Ldo_avx512_epilogue:
2789___
2790$code.=<<___    if (!$win64);
2791        lea             -8(%r10),%rsp
2792.cfi_def_cfa_register   %rsp
2793___
2794$code.=<<___;
2795        ret
2796.cfi_endproc
2797___
2798
2799}
2800
2801}
2802
2803&declare_function("poly1305_blocks_avx2", 32, 4);
2804poly1305_blocks_avxN(0);
2805&end_function("poly1305_blocks_avx2");
2806
2807#######################################################################
2808if ($avx>2) {
2809# On entry the input length is divisible by 64. But since the inner loop
2810# processes 128 bytes per iteration, cases where the length is not divisible
2811# by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
2812# reason the stack layout is kept identical to poly1305_blocks_avx2. If not
2813# for this tail, we wouldn't even have to allocate a stack frame...
2814
2815if($kernel) {
2816        $code .= "#ifdef CONFIG_AS_AVX512\n";
2817}
2818
2819&declare_function("poly1305_blocks_avx512", 32, 4);
2820poly1305_blocks_avxN(1);
2821&end_function("poly1305_blocks_avx512");
2822
2823if ($kernel) {
2824        $code .= "#endif\n";
2825}
2826
2827if (!$kernel && $avx>3) {
2828########################################################################
2829# VPMADD52 version using 2^44 radix.
2830#
2831# One can argue that base 2^52 would be more natural. Well, even though
2832# some operations would be more natural, one has to recognize a couple
2833# of things. First, base 2^52 doesn't provide an advantage over base 2^44
2834# in terms of the number of multiply-and-accumulate operations. Secondly,
2835# it makes it impossible to pre-compute multiples of 5 [referred to as
2836# s[]/sN in reference implementations], which means that more of those
2837# operations would have to be performed in the inner loop, which in turn
2838# makes the critical path longer. In other words, even though base 2^44
2839# reduction might look less elegant, the overall critical path is shorter...
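#
# For orientation: 130-bit quantities are kept as three limbs of 44, 44 and
# 42 bits (weights 2^0, 2^44 and 2^88), so both vpmadd52 operands stay below
# 52 bits and carries can be deferred. Since 2^132 == 4*2^130 == 20 modulo
# 2^130-5, wrap-around products use the pre-computed s[i] = r[i]*20, giving
# the schedule implemented below (a summary, not additional code):
#
#       d0 = h0*r0 + h1*s2 + h2*s1      # weight 2^0
#       d1 = h0*r1 + h1*r0 + h2*s2      # weight 2^44
#       d2 = h0*r2 + h1*r1 + h2*r0      # weight 2^88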
2840
2841########################################################################
2842# The layout of the opaque area is as follows.
2843#
2844#       unsigned __int64 h[3];          # current hash value base 2^44
2845#       unsigned __int64 s[2];          # key value*20 base 2^44
2846#       unsigned __int64 r[3];          # key value base 2^44
2847#       struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2848#                                       # r^n positions reflect
2849#                                       # placement in register, not
2850#                                       # memory, R[3] is R[1]*20
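#
# Or, as a hedged C view of the same area (illustration only; this struct is
# not declared anywhere in this file):
#
#       struct poly1305_base2_44_ctx {
#               unsigned __int64 h[3];          # offset  0
#               unsigned __int64 s[2];          # offset 24
#               unsigned __int64 r[3];          # offset 40
#               unsigned __int64 R[4][4];       # offset 64, R[3] = R[1]*20
#       };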
2851
2852$code.=<<___;
2853.type   poly1305_init_base2_44,\@function,3
2854.align  32
2855poly1305_init_base2_44:
2856        xor     %eax,%eax
2857        mov     %rax,0($ctx)            # initialize hash value
2858        mov     %rax,8($ctx)
2859        mov     %rax,16($ctx)
2860
2861.Linit_base2_44:
2862        lea     poly1305_blocks_vpmadd52(%rip),%r10
2863        lea     poly1305_emit_base2_44(%rip),%r11
2864
2865        mov     \$0x0ffffffc0fffffff,%rax
2866        mov     \$0x0ffffffc0ffffffc,%rcx
2867        and     0($inp),%rax
2868        mov     \$0x00000fffffffffff,%r8
2869        and     8($inp),%rcx
2870        mov     \$0x00000fffffffffff,%r9
2871        and     %rax,%r8
2872        shrd    \$44,%rcx,%rax
2873        mov     %r8,40($ctx)            # r0
2874        and     %r9,%rax
2875        shr     \$24,%rcx
2876        mov     %rax,48($ctx)           # r1
2877        lea     (%rax,%rax,4),%rax      # *5
2878        mov     %rcx,56($ctx)           # r2
2879        shl     \$2,%rax                # magic <<2
2880        lea     (%rcx,%rcx,4),%rcx      # *5
2881        shl     \$2,%rcx                # magic <<2
2882        mov     %rax,24($ctx)           # s1
2883        mov     %rcx,32($ctx)           # s2
2884        movq    \$-1,64($ctx)           # write impossible value
2885___
2886$code.=<<___    if ($flavour !~ /elf32/);
2887        mov     %r10,0(%rdx)
2888        mov     %r11,8(%rdx)
2889___
2890$code.=<<___    if ($flavour =~ /elf32/);
2891        mov     %r10d,0(%rdx)
2892        mov     %r11d,4(%rdx)
2893___
2894$code.=<<___;
2895        mov     \$1,%eax
2896        ret
2897.size   poly1305_init_base2_44,.-poly1305_init_base2_44
2898___
2899{
2900my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2901my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2902my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2903
2904$code.=<<___;
2905.type   poly1305_blocks_vpmadd52,\@function,4
2906.align  32
2907poly1305_blocks_vpmadd52:
2908        shr     \$4,$len
2909        jz      .Lno_data_vpmadd52              # too short
2910
2911        shl     \$40,$padbit
2912        mov     64($ctx),%r8                    # peek on power of the key
2913
2914        # if the powers of the key are not calculated yet, process up to 3
2915        # blocks with this single-block subroutine; otherwise ensure that
2916        # the length is divisible by 2 blocks and pass the rest down to the
2917        # next subroutine...
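        #
        # roughly, in C (an illustrative sketch of the cmov logic below,
        # not generated code; powers_valid means the marker at 64(ctx) is
        # non-negative):
        #
        #       blocks = 3;
        #       if (len >= 4 || powers_valid)
        #               blocks = 1;
        #       blocks &= len;
        #       if (blocks == 0) goto blocks_vpmadd52_4x;
        #       len -= blocks;                  # rest goes to the 4x code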
2918
2919        mov     \$3,%rax
2920        mov     \$1,%r10
2921        cmp     \$4,$len                        # is input long
2922        cmovae  %r10,%rax
2923        test    %r8,%r8                         # is power value impossible?
2924        cmovns  %r10,%rax
2925
2926        and     $len,%rax                       # is input of favourable length?
2927        jz      .Lblocks_vpmadd52_4x
2928
2929        sub             %rax,$len
2930        mov             \$7,%r10d
2931        mov             \$1,%r11d
2932        kmovw           %r10d,%k7
2933        lea             .L2_44_inp_permd(%rip),%r10
2934        kmovw           %r11d,%k1
2935
2936        vmovq           $padbit,%x#$PAD
2937        vmovdqa64       0(%r10),$inp_permd      # .L2_44_inp_permd
2938        vmovdqa64       32(%r10),$inp_shift     # .L2_44_inp_shift
2939        vpermq          \$0xcf,$PAD,$PAD
2940        vmovdqa64       64(%r10),$reduc_mask    # .L2_44_mask
2941
2942        vmovdqu64       0($ctx),${Dlo}{%k7}{z}          # load hash value
2943        vmovdqu64       40($ctx),${r2r1r0}{%k7}{z}      # load keys
2944        vmovdqu64       32($ctx),${r1r0s2}{%k7}{z}
2945        vmovdqu64       24($ctx),${r0s2s1}{%k7}{z}
2946
2947        vmovdqa64       96(%r10),$reduc_rght    # .L2_44_shift_rgt
2948        vmovdqa64       128(%r10),$reduc_left   # .L2_44_shift_lft
2949
2950        jmp             .Loop_vpmadd52
2951
2952.align  32
2953.Loop_vpmadd52:
2954        vmovdqu32       0($inp),%x#$T0          # load input as ----3210
2955        lea             16($inp),$inp
2956
2957        vpermd          $T0,$inp_permd,$T0      # ----3210 -> --322110
2958        vpsrlvq         $inp_shift,$T0,$T0
2959        vpandq          $reduc_mask,$T0,$T0
2960        vporq           $PAD,$T0,$T0
2961
2962        vpaddq          $T0,$Dlo,$Dlo           # accumulate input
2963
2964        vpermq          \$0,$Dlo,${H0}{%k7}{z}  # smash hash value
2965        vpermq          \$0b01010101,$Dlo,${H1}{%k7}{z}
2966        vpermq          \$0b10101010,$Dlo,${H2}{%k7}{z}
2967
2968        vpxord          $Dlo,$Dlo,$Dlo
2969        vpxord          $Dhi,$Dhi,$Dhi
2970
2971        vpmadd52luq     $r2r1r0,$H0,$Dlo
2972        vpmadd52huq     $r2r1r0,$H0,$Dhi
2973
2974        vpmadd52luq     $r1r0s2,$H1,$Dlo
2975        vpmadd52huq     $r1r0s2,$H1,$Dhi
2976
2977        vpmadd52luq     $r0s2s1,$H2,$Dlo
2978        vpmadd52huq     $r0s2s1,$H2,$Dhi
2979
2980        vpsrlvq         $reduc_rght,$Dlo,$T0    # 0 in topmost qword
2981        vpsllvq         $reduc_left,$Dhi,$Dhi   # 0 in topmost qword
2982        vpandq          $reduc_mask,$Dlo,$Dlo
2983
2984        vpaddq          $T0,$Dhi,$Dhi
2985
2986        vpermq          \$0b10010011,$Dhi,$Dhi  # 0 in lowest qword
2987
2988        vpaddq          $Dhi,$Dlo,$Dlo          # note topmost qword :-)
2989
2990        vpsrlvq         $reduc_rght,$Dlo,$T0    # 0 in topmost word
2991        vpandq          $reduc_mask,$Dlo,$Dlo
2992
2993        vpermq          \$0b10010011,$T0,$T0
2994
2995        vpaddq          $T0,$Dlo,$Dlo
2996
2997        vpermq          \$0b10010011,$Dlo,${T0}{%k1}{z}
2998
2999        vpaddq          $T0,$Dlo,$Dlo
3000        vpsllq          \$2,$T0,$T0
3001
3002        vpaddq          $T0,$Dlo,$Dlo
3003
3004        dec             %rax                    # len-=16
3005        jnz             .Loop_vpmadd52
3006
3007        vmovdqu64       $Dlo,0($ctx){%k7}       # store hash value
3008
3009        test            $len,$len
3010        jnz             .Lblocks_vpmadd52_4x
3011
3012.Lno_data_vpmadd52:
3013        ret
3014.size   poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
3015___
3016}
3017{
3018########################################################################
3019# As implied by its name, the 4x subroutine processes 4 blocks in parallel
3020# (but also handles 4*n+2 block lengths). It takes up to the 4th key power
3021# and operates on 256-bit %ymm registers.
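#
# The identity being exploited is the usual vectorized Poly1305 scheme (a
# description, not new code): with four message blocks m1..m4 per iteration,
# the update is equivalent to evaluating
#
#       h = (h + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r^1     (mod 2^130-5)
#
# so the inner loop multiplies every lane by r^4 and the tail applies the
# per-lane powers r^4..r^1 before the horizontal addition.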
3022
3023my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3024my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3025my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3026
3027$code.=<<___;
3028.type   poly1305_blocks_vpmadd52_4x,\@function,4
3029.align  32
3030poly1305_blocks_vpmadd52_4x:
3031        shr     \$4,$len
3032        jz      .Lno_data_vpmadd52_4x           # too short
3033
3034        shl     \$40,$padbit
3035        mov     64($ctx),%r8                    # peek on power of the key
3036
3037.Lblocks_vpmadd52_4x:
3038        vpbroadcastq    $padbit,$PAD
3039
3040        vmovdqa64       .Lx_mask44(%rip),$mask44
3041        mov             \$5,%eax
3042        vmovdqa64       .Lx_mask42(%rip),$mask42
3043        kmovw           %eax,%k1                # used in 2x path
3044
3045        test            %r8,%r8                 # is power value impossible?
3046        js              .Linit_vpmadd52         # if it is, then init R[4]
3047
3048        vmovq           0($ctx),%x#$H0          # load current hash value
3049        vmovq           8($ctx),%x#$H1
3050        vmovq           16($ctx),%x#$H2
3051
3052        test            \$3,$len                # is length 4*n+2?
3053        jnz             .Lblocks_vpmadd52_2x_do
3054
3055.Lblocks_vpmadd52_4x_do:
3056        vpbroadcastq    64($ctx),$R0            # load 4th power of the key
3057        vpbroadcastq    96($ctx),$R1
3058        vpbroadcastq    128($ctx),$R2
3059        vpbroadcastq    160($ctx),$S1
3060
3061.Lblocks_vpmadd52_4x_key_loaded:
3062        vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
3063        vpaddq          $R2,$S2,$S2
3064        vpsllq          \$2,$S2,$S2
3065
3066        test            \$7,$len                # is len 8*n?
3067        jz              .Lblocks_vpmadd52_8x
3068
3069        vmovdqu64       16*0($inp),$T2          # load data
3070        vmovdqu64       16*2($inp),$T3
3071        lea             16*4($inp),$inp
3072
3073        vpunpcklqdq     $T3,$T2,$T1             # transpose data
3074        vpunpckhqdq     $T3,$T2,$T3
3075
3076        # at this point 64-bit lanes are ordered as 3-1-2-0
3077
3078        vpsrlq          \$24,$T3,$T2            # splat the data
3079        vporq           $PAD,$T2,$T2
3080         vpaddq         $T2,$H2,$H2             # accumulate input
3081        vpandq          $mask44,$T1,$T0
3082        vpsrlq          \$44,$T1,$T1
3083        vpsllq          \$20,$T3,$T3
3084        vporq           $T3,$T1,$T1
3085        vpandq          $mask44,$T1,$T1
3086
3087        sub             \$4,$len
3088        jz              .Ltail_vpmadd52_4x
3089        jmp             .Loop_vpmadd52_4x
3090        ud2
3091
3092.align  32
3093.Linit_vpmadd52:
3094        vmovq           24($ctx),%x#$S1         # load key
3095        vmovq           56($ctx),%x#$H2
3096        vmovq           32($ctx),%x#$S2
3097        vmovq           40($ctx),%x#$R0
3098        vmovq           48($ctx),%x#$R1
3099
3100        vmovdqa         $R0,$H0
3101        vmovdqa         $R1,$H1
3102        vmovdqa         $H2,$R2
3103
3104        mov             \$2,%eax
3105
3106.Lmul_init_vpmadd52:
3107        vpxorq          $D0lo,$D0lo,$D0lo
3108        vpmadd52luq     $H2,$S1,$D0lo
3109        vpxorq          $D0hi,$D0hi,$D0hi
3110        vpmadd52huq     $H2,$S1,$D0hi
3111        vpxorq          $D1lo,$D1lo,$D1lo
3112        vpmadd52luq     $H2,$S2,$D1lo
3113        vpxorq          $D1hi,$D1hi,$D1hi
3114        vpmadd52huq     $H2,$S2,$D1hi
3115        vpxorq          $D2lo,$D2lo,$D2lo
3116        vpmadd52luq     $H2,$R0,$D2lo
3117        vpxorq          $D2hi,$D2hi,$D2hi
3118        vpmadd52huq     $H2,$R0,$D2hi
3119
3120        vpmadd52luq     $H0,$R0,$D0lo
3121        vpmadd52huq     $H0,$R0,$D0hi
3122        vpmadd52luq     $H0,$R1,$D1lo
3123        vpmadd52huq     $H0,$R1,$D1hi
3124        vpmadd52luq     $H0,$R2,$D2lo
3125        vpmadd52huq     $H0,$R2,$D2hi
3126
3127        vpmadd52luq     $H1,$S2,$D0lo
3128        vpmadd52huq     $H1,$S2,$D0hi
3129        vpmadd52luq     $H1,$R0,$D1lo
3130        vpmadd52huq     $H1,$R0,$D1hi
3131        vpmadd52luq     $H1,$R1,$D2lo
3132        vpmadd52huq     $H1,$R1,$D2hi
3133
3134        ################################################################
3135        # partial reduction
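        # (an explanatory sketch only: with D = Dlo + 2^52*Dhi per limb,
        #  the instructions below compute
        #       c0 = D0>>44; h0 = D0 & mask44; D1 += c0;
        #       c1 = D1>>44; h1 = D1 & mask44; D2 += c1;
        #       c2 = D2>>42; h2 = D2 & mask42;
        #       h0 += c2*5, i.e. h0 += c2 + (c2<<2);
        #       c  = h0>>44; h0 &= mask44;     h1 += c)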
3136        vpsrlq          \$44,$D0lo,$tmp
3137        vpsllq          \$8,$D0hi,$D0hi
3138        vpandq          $mask44,$D0lo,$H0
3139        vpaddq          $tmp,$D0hi,$D0hi
3140
3141        vpaddq          $D0hi,$D1lo,$D1lo
3142
3143        vpsrlq          \$44,$D1lo,$tmp
3144        vpsllq          \$8,$D1hi,$D1hi
3145        vpandq          $mask44,$D1lo,$H1
3146        vpaddq          $tmp,$D1hi,$D1hi
3147
3148        vpaddq          $D1hi,$D2lo,$D2lo
3149
3150        vpsrlq          \$42,$D2lo,$tmp
3151        vpsllq          \$10,$D2hi,$D2hi
3152        vpandq          $mask42,$D2lo,$H2
3153        vpaddq          $tmp,$D2hi,$D2hi
3154
3155        vpaddq          $D2hi,$H0,$H0
3156        vpsllq          \$2,$D2hi,$D2hi
3157
3158        vpaddq          $D2hi,$H0,$H0
3159
3160        vpsrlq          \$44,$H0,$tmp           # additional step
3161        vpandq          $mask44,$H0,$H0
3162
3163        vpaddq          $tmp,$H1,$H1
3164
3165        dec             %eax
3166        jz              .Ldone_init_vpmadd52
3167
3168        vpunpcklqdq     $R1,$H1,$R1             # 1,2
3169        vpbroadcastq    %x#$H1,%x#$H1           # 2,2
3170        vpunpcklqdq     $R2,$H2,$R2
3171        vpbroadcastq    %x#$H2,%x#$H2
3172        vpunpcklqdq     $R0,$H0,$R0
3173        vpbroadcastq    %x#$H0,%x#$H0
3174
3175        vpsllq          \$2,$R1,$S1             # S1 = R1*5*4
3176        vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
3177        vpaddq          $R1,$S1,$S1
3178        vpaddq          $R2,$S2,$S2
3179        vpsllq          \$2,$S1,$S1
3180        vpsllq          \$2,$S2,$S2
3181
3182        jmp             .Lmul_init_vpmadd52
3183        ud2
3184
3185.align  32
3186.Ldone_init_vpmadd52:
3187        vinserti128     \$1,%x#$R1,$H1,$R1      # 1,2,3,4
3188        vinserti128     \$1,%x#$R2,$H2,$R2
3189        vinserti128     \$1,%x#$R0,$H0,$R0
3190
3191        vpermq          \$0b11011000,$R1,$R1    # 1,3,2,4
3192        vpermq          \$0b11011000,$R2,$R2
3193        vpermq          \$0b11011000,$R0,$R0
3194
3195        vpsllq          \$2,$R1,$S1             # S1 = R1*5*4
3196        vpaddq          $R1,$S1,$S1
3197        vpsllq          \$2,$S1,$S1
3198
3199        vmovq           0($ctx),%x#$H0          # load current hash value
3200        vmovq           8($ctx),%x#$H1
3201        vmovq           16($ctx),%x#$H2
3202
3203        test            \$3,$len                # is length 4*n+2?
3204        jnz             .Ldone_init_vpmadd52_2x
3205
3206        vmovdqu64       $R0,64($ctx)            # save key powers
3207        vpbroadcastq    %x#$R0,$R0              # broadcast 4th power
3208        vmovdqu64       $R1,96($ctx)
3209        vpbroadcastq    %x#$R1,$R1
3210        vmovdqu64       $R2,128($ctx)
3211        vpbroadcastq    %x#$R2,$R2
3212        vmovdqu64       $S1,160($ctx)
3213        vpbroadcastq    %x#$S1,$S1
3214
3215        jmp             .Lblocks_vpmadd52_4x_key_loaded
3216        ud2
3217
3218.align  32
3219.Ldone_init_vpmadd52_2x:
3220        vmovdqu64       $R0,64($ctx)            # save key powers
3221        vpsrldq         \$8,$R0,$R0             # 0-1-0-2
3222        vmovdqu64       $R1,96($ctx)
3223        vpsrldq         \$8,$R1,$R1
3224        vmovdqu64       $R2,128($ctx)
3225        vpsrldq         \$8,$R2,$R2
3226        vmovdqu64       $S1,160($ctx)
3227        vpsrldq         \$8,$S1,$S1
3228        jmp             .Lblocks_vpmadd52_2x_key_loaded
3229        ud2
3230
3231.align  32
3232.Lblocks_vpmadd52_2x_do:
3233        vmovdqu64       128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3234        vmovdqu64       160+8($ctx),${S1}{%k1}{z}
3235        vmovdqu64       64+8($ctx),${R0}{%k1}{z}
3236        vmovdqu64       96+8($ctx),${R1}{%k1}{z}
3237
3238.Lblocks_vpmadd52_2x_key_loaded:
3239        vmovdqu64       16*0($inp),$T2          # load data
3240        vpxorq          $T3,$T3,$T3
3241        lea             16*2($inp),$inp
3242
3243        vpunpcklqdq     $T3,$T2,$T1             # transpose data
3244        vpunpckhqdq     $T3,$T2,$T3
3245
3246        # at this point 64-bit lanes are ordered as x-1-x-0
3247
3248        vpsrlq          \$24,$T3,$T2            # splat the data
3249        vporq           $PAD,$T2,$T2
3250         vpaddq         $T2,$H2,$H2             # accumulate input
3251        vpandq          $mask44,$T1,$T0
3252        vpsrlq          \$44,$T1,$T1
3253        vpsllq          \$20,$T3,$T3
3254        vporq           $T3,$T1,$T1
3255        vpandq          $mask44,$T1,$T1
3256
3257        jmp             .Ltail_vpmadd52_2x
3258        ud2
3259
3260.align  32
3261.Loop_vpmadd52_4x:
3262        #vpaddq         $T2,$H2,$H2             # accumulate input
3263        vpaddq          $T0,$H0,$H0
3264        vpaddq          $T1,$H1,$H1
3265
3266        vpxorq          $D0lo,$D0lo,$D0lo
3267        vpmadd52luq     $H2,$S1,$D0lo
3268        vpxorq          $D0hi,$D0hi,$D0hi
3269        vpmadd52huq     $H2,$S1,$D0hi
3270        vpxorq          $D1lo,$D1lo,$D1lo
3271        vpmadd52luq     $H2,$S2,$D1lo
3272        vpxorq          $D1hi,$D1hi,$D1hi
3273        vpmadd52huq     $H2,$S2,$D1hi
3274        vpxorq          $D2lo,$D2lo,$D2lo
3275        vpmadd52luq     $H2,$R0,$D2lo
3276        vpxorq          $D2hi,$D2hi,$D2hi
3277        vpmadd52huq     $H2,$R0,$D2hi
3278
3279         vmovdqu64      16*0($inp),$T2          # load data
3280         vmovdqu64      16*2($inp),$T3
3281         lea            16*4($inp),$inp
3282        vpmadd52luq     $H0,$R0,$D0lo
3283        vpmadd52huq     $H0,$R0,$D0hi
3284        vpmadd52luq     $H0,$R1,$D1lo
3285        vpmadd52huq     $H0,$R1,$D1hi
3286        vpmadd52luq     $H0,$R2,$D2lo
3287        vpmadd52huq     $H0,$R2,$D2hi
3288
3289         vpunpcklqdq    $T3,$T2,$T1             # transpose data
3290         vpunpckhqdq    $T3,$T2,$T3
3291        vpmadd52luq     $H1,$S2,$D0lo
3292        vpmadd52huq     $H1,$S2,$D0hi
3293        vpmadd52luq     $H1,$R0,$D1lo
3294        vpmadd52huq     $H1,$R0,$D1hi
3295        vpmadd52luq     $H1,$R1,$D2lo
3296        vpmadd52huq     $H1,$R1,$D2hi
3297
3298        ################################################################
3299        # partial reduction (interleaved with data splat)
3300        vpsrlq          \$44,$D0lo,$tmp
3301        vpsllq          \$8,$D0hi,$D0hi
3302        vpandq          $mask44,$D0lo,$H0
3303        vpaddq          $tmp,$D0hi,$D0hi
3304
3305         vpsrlq         \$24,$T3,$T2
3306         vporq          $PAD,$T2,$T2
3307        vpaddq          $D0hi,$D1lo,$D1lo
3308
3309        vpsrlq          \$44,$D1lo,$tmp
3310        vpsllq          \$8,$D1hi,$D1hi
3311        vpandq          $mask44,$D1lo,$H1
3312        vpaddq          $tmp,$D1hi,$D1hi
3313
3314         vpandq         $mask44,$T1,$T0
3315         vpsrlq         \$44,$T1,$T1
3316         vpsllq         \$20,$T3,$T3
3317        vpaddq          $D1hi,$D2lo,$D2lo
3318
3319        vpsrlq          \$42,$D2lo,$tmp
3320        vpsllq          \$10,$D2hi,$D2hi
3321        vpandq          $mask42,$D2lo,$H2
3322        vpaddq          $tmp,$D2hi,$D2hi
3323
3324          vpaddq        $T2,$H2,$H2             # accumulate input
3325        vpaddq          $D2hi,$H0,$H0
3326        vpsllq          \$2,$D2hi,$D2hi
3327
3328        vpaddq          $D2hi,$H0,$H0
3329         vporq          $T3,$T1,$T1
3330         vpandq         $mask44,$T1,$T1
3331
3332        vpsrlq          \$44,$H0,$tmp           # additional step
3333        vpandq          $mask44,$H0,$H0
3334
3335        vpaddq          $tmp,$H1,$H1
3336
3337        sub             \$4,$len                # len-=64
3338        jnz             .Loop_vpmadd52_4x
3339
3340.Ltail_vpmadd52_4x:
3341        vmovdqu64       128($ctx),$R2           # load all key powers
3342        vmovdqu64       160($ctx),$S1
3343        vmovdqu64       64($ctx),$R0
3344        vmovdqu64       96($ctx),$R1
3345
3346.Ltail_vpmadd52_2x:
3347        vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
3348        vpaddq          $R2,$S2,$S2
3349        vpsllq          \$2,$S2,$S2
3350
3351        #vpaddq         $T2,$H2,$H2             # accumulate input
3352        vpaddq          $T0,$H0,$H0
3353        vpaddq          $T1,$H1,$H1
3354
3355        vpxorq          $D0lo,$D0lo,$D0lo
3356        vpmadd52luq     $H2,$S1,$D0lo
3357        vpxorq          $D0hi,$D0hi,$D0hi
3358        vpmadd52huq     $H2,$S1,$D0hi
3359        vpxorq          $D1lo,$D1lo,$D1lo
3360        vpmadd52luq     $H2,$S2,$D1lo
3361        vpxorq          $D1hi,$D1hi,$D1hi
3362        vpmadd52huq     $H2,$S2,$D1hi
3363        vpxorq          $D2lo,$D2lo,$D2lo
3364        vpmadd52luq     $H2,$R0,$D2lo
3365        vpxorq          $D2hi,$D2hi,$D2hi
3366        vpmadd52huq     $H2,$R0,$D2hi
3367
3368        vpmadd52luq     $H0,$R0,$D0lo
3369        vpmadd52huq     $H0,$R0,$D0hi
3370        vpmadd52luq     $H0,$R1,$D1lo
3371        vpmadd52huq     $H0,$R1,$D1hi
3372        vpmadd52luq     $H0,$R2,$D2lo
3373        vpmadd52huq     $H0,$R2,$D2hi
3374
3375        vpmadd52luq     $H1,$S2,$D0lo
3376        vpmadd52huq     $H1,$S2,$D0hi
3377        vpmadd52luq     $H1,$R0,$D1lo
3378        vpmadd52huq     $H1,$R0,$D1hi
3379        vpmadd52luq     $H1,$R1,$D2lo
3380        vpmadd52huq     $H1,$R1,$D2hi
3381
3382        ################################################################
3383        # horizontal addition
3384
3385        mov             \$1,%eax
3386        kmovw           %eax,%k1
3387        vpsrldq         \$8,$D0lo,$T0
3388        vpsrldq         \$8,$D0hi,$H0
3389        vpsrldq         \$8,$D1lo,$T1
3390        vpsrldq         \$8,$D1hi,$H1
3391        vpaddq          $T0,$D0lo,$D0lo
3392        vpaddq          $H0,$D0hi,$D0hi
3393        vpsrldq         \$8,$D2lo,$T2
3394        vpsrldq         \$8,$D2hi,$H2
3395        vpaddq          $T1,$D1lo,$D1lo
3396        vpaddq          $H1,$D1hi,$D1hi
3397         vpermq         \$0x2,$D0lo,$T0
3398         vpermq         \$0x2,$D0hi,$H0
3399        vpaddq          $T2,$D2lo,$D2lo
3400        vpaddq          $H2,$D2hi,$D2hi
3401
3402        vpermq          \$0x2,$D1lo,$T1
3403        vpermq          \$0x2,$D1hi,$H1
3404        vpaddq          $T0,$D0lo,${D0lo}{%k1}{z}
3405        vpaddq          $H0,$D0hi,${D0hi}{%k1}{z}
3406        vpermq          \$0x2,$D2lo,$T2
3407        vpermq          \$0x2,$D2hi,$H2
3408        vpaddq          $T1,$D1lo,${D1lo}{%k1}{z}
3409        vpaddq          $H1,$D1hi,${D1hi}{%k1}{z}
3410        vpaddq          $T2,$D2lo,${D2lo}{%k1}{z}
3411        vpaddq          $H2,$D2hi,${D2hi}{%k1}{z}
3412
3413        ################################################################
3414        # partial reduction
3415        vpsrlq          \$44,$D0lo,$tmp
3416        vpsllq          \$8,$D0hi,$D0hi
3417        vpandq          $mask44,$D0lo,$H0
3418        vpaddq          $tmp,$D0hi,$D0hi
3419
3420        vpaddq          $D0hi,$D1lo,$D1lo
3421
3422        vpsrlq          \$44,$D1lo,$tmp
3423        vpsllq          \$8,$D1hi,$D1hi
3424        vpandq          $mask44,$D1lo,$H1
3425        vpaddq          $tmp,$D1hi,$D1hi
3426
3427        vpaddq          $D1hi,$D2lo,$D2lo
3428
3429        vpsrlq          \$42,$D2lo,$tmp
3430        vpsllq          \$10,$D2hi,$D2hi
3431        vpandq          $mask42,$D2lo,$H2
3432        vpaddq          $tmp,$D2hi,$D2hi
3433
3434        vpaddq          $D2hi,$H0,$H0
3435        vpsllq          \$2,$D2hi,$D2hi
3436
3437        vpaddq          $D2hi,$H0,$H0
3438
3439        vpsrlq          \$44,$H0,$tmp           # additional step
3440        vpandq          $mask44,$H0,$H0
3441
3442        vpaddq          $tmp,$H1,$H1
3443                                                # at this point $len is
3444                                                # either 4*n+2 or 0...
3445        sub             \$2,$len                # len-=32
3446        ja              .Lblocks_vpmadd52_4x_do
3447
3448        vmovq           %x#$H0,0($ctx)
3449        vmovq           %x#$H1,8($ctx)
3450        vmovq           %x#$H2,16($ctx)
3451        vzeroall
3452
3453.Lno_data_vpmadd52_4x:
3454        ret
3455.size   poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3456___
3457}
3458{
3459########################################################################
3460# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
3461# This is an intermediate version, as it's used only in cases when the input
3462# length is either 8*n, 8*n+1 or 8*n+2...
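# (The extra powers are obtained below as r^(4+k) = r^4 * r^k for k = 1..4,
#  reusing the stored 1-3-2-4 powers and broadcasting the 4th one.)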
3463
3464my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3465my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3466my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3467my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3468
3469$code.=<<___;
3470.type   poly1305_blocks_vpmadd52_8x,\@function,4
3471.align  32
3472poly1305_blocks_vpmadd52_8x:
3473        shr     \$4,$len
3474        jz      .Lno_data_vpmadd52_8x           # too short
3475
3476        shl     \$40,$padbit
3477        mov     64($ctx),%r8                    # peek on power of the key
3478
3479        vmovdqa64       .Lx_mask44(%rip),$mask44
3480        vmovdqa64       .Lx_mask42(%rip),$mask42
3481
3482        test    %r8,%r8                         # is power value impossible?
3483        js      .Linit_vpmadd52                 # if it is, then init R[4]
3484
3485        vmovq   0($ctx),%x#$H0                  # load current hash value
3486        vmovq   8($ctx),%x#$H1
3487        vmovq   16($ctx),%x#$H2
3488
3489.Lblocks_vpmadd52_8x:
3490        ################################################################
3491        # first we calculate more key powers
3492
3493        vmovdqu64       128($ctx),$R2           # load 1-3-2-4 powers
3494        vmovdqu64       160($ctx),$S1
3495        vmovdqu64       64($ctx),$R0
3496        vmovdqu64       96($ctx),$R1
3497
3498        vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
3499        vpaddq          $R2,$S2,$S2
3500        vpsllq          \$2,$S2,$S2
3501
3502        vpbroadcastq    %x#$R2,$RR2             # broadcast 4th power
3503        vpbroadcastq    %x#$R0,$RR0
3504        vpbroadcastq    %x#$R1,$RR1
3505
3506        vpxorq          $D0lo,$D0lo,$D0lo
3507        vpmadd52luq     $RR2,$S1,$D0lo
3508        vpxorq          $D0hi,$D0hi,$D0hi
3509        vpmadd52huq     $RR2,$S1,$D0hi
3510        vpxorq          $D1lo,$D1lo,$D1lo
3511        vpmadd52luq     $RR2,$S2,$D1lo
3512        vpxorq          $D1hi,$D1hi,$D1hi
3513        vpmadd52huq     $RR2,$S2,$D1hi
3514        vpxorq          $D2lo,$D2lo,$D2lo
3515        vpmadd52luq     $RR2,$R0,$D2lo
3516        vpxorq          $D2hi,$D2hi,$D2hi
3517        vpmadd52huq     $RR2,$R0,$D2hi
3518
3519        vpmadd52luq     $RR0,$R0,$D0lo
3520        vpmadd52huq     $RR0,$R0,$D0hi
3521        vpmadd52luq     $RR0,$R1,$D1lo
3522        vpmadd52huq     $RR0,$R1,$D1hi
3523        vpmadd52luq     $RR0,$R2,$D2lo
3524        vpmadd52huq     $RR0,$R2,$D2hi
3525
3526        vpmadd52luq     $RR1,$S2,$D0lo
3527        vpmadd52huq     $RR1,$S2,$D0hi
3528        vpmadd52luq     $RR1,$R0,$D1lo
3529        vpmadd52huq     $RR1,$R0,$D1hi
3530        vpmadd52luq     $RR1,$R1,$D2lo
3531        vpmadd52huq     $RR1,$R1,$D2hi
3532
3533        ################################################################
3534        # partial reduction
3535        vpsrlq          \$44,$D0lo,$tmp
3536        vpsllq          \$8,$D0hi,$D0hi
3537        vpandq          $mask44,$D0lo,$RR0
3538        vpaddq          $tmp,$D0hi,$D0hi
3539
3540        vpaddq          $D0hi,$D1lo,$D1lo
3541
3542        vpsrlq          \$44,$D1lo,$tmp
3543        vpsllq          \$8,$D1hi,$D1hi
3544        vpandq          $mask44,$D1lo,$RR1
3545        vpaddq          $tmp,$D1hi,$D1hi
3546
3547        vpaddq          $D1hi,$D2lo,$D2lo
3548
3549        vpsrlq          \$42,$D2lo,$tmp
3550        vpsllq          \$10,$D2hi,$D2hi
3551        vpandq          $mask42,$D2lo,$RR2
3552        vpaddq          $tmp,$D2hi,$D2hi
3553
3554        vpaddq          $D2hi,$RR0,$RR0
3555        vpsllq          \$2,$D2hi,$D2hi
3556
3557        vpaddq          $D2hi,$RR0,$RR0
3558
3559        vpsrlq          \$44,$RR0,$tmp          # additional step
3560        vpandq          $mask44,$RR0,$RR0
3561
3562        vpaddq          $tmp,$RR1,$RR1
3563
3564        ################################################################
3565        # At this point Rx holds powers in 1324 order, RRx in 5768 order, and
3566        # the goal is 15263748 order, which reflects how the data is loaded...
3567
3568        vpunpcklqdq     $R2,$RR2,$T2            # 3748
3569        vpunpckhqdq     $R2,$RR2,$R2            # 1526
3570        vpunpcklqdq     $R0,$RR0,$T0
3571        vpunpckhqdq     $R0,$RR0,$R0
3572        vpunpcklqdq     $R1,$RR1,$T1
3573        vpunpckhqdq     $R1,$RR1,$R1
3574___
3575######## switch to %zmm
3576map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3577map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3578map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3579map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3580
3581$code.=<<___;
3582        vshufi64x2      \$0x44,$R2,$T2,$RR2     # 15263748
3583        vshufi64x2      \$0x44,$R0,$T0,$RR0
3584        vshufi64x2      \$0x44,$R1,$T1,$RR1
3585
3586        vmovdqu64       16*0($inp),$T2          # load data
3587        vmovdqu64       16*4($inp),$T3
3588        lea             16*8($inp),$inp
3589
3590        vpsllq          \$2,$RR2,$SS2           # S2 = R2*5*4
3591        vpsllq          \$2,$RR1,$SS1           # S1 = R1*5*4
3592        vpaddq          $RR2,$SS2,$SS2
3593        vpaddq          $RR1,$SS1,$SS1
3594        vpsllq          \$2,$SS2,$SS2
3595        vpsllq          \$2,$SS1,$SS1
3596
3597        vpbroadcastq    $padbit,$PAD
3598        vpbroadcastq    %x#$mask44,$mask44
3599        vpbroadcastq    %x#$mask42,$mask42
3600
3601        vpbroadcastq    %x#$SS1,$S1             # broadcast 8th power
3602        vpbroadcastq    %x#$SS2,$S2
3603        vpbroadcastq    %x#$RR0,$R0
3604        vpbroadcastq    %x#$RR1,$R1
3605        vpbroadcastq    %x#$RR2,$R2
3606
3607        vpunpcklqdq     $T3,$T2,$T1             # transpose data
3608        vpunpckhqdq     $T3,$T2,$T3
3609
3610        # at this point 64-bit lanes are ordered as 73625140
3611
3612        vpsrlq          \$24,$T3,$T2            # splat the data
3613        vporq           $PAD,$T2,$T2
3614         vpaddq         $T2,$H2,$H2             # accumulate input
3615        vpandq          $mask44,$T1,$T0
3616        vpsrlq          \$44,$T1,$T1
3617        vpsllq          \$20,$T3,$T3
3618        vporq           $T3,$T1,$T1
3619        vpandq          $mask44,$T1,$T1
3620
3621        sub             \$8,$len
3622        jz              .Ltail_vpmadd52_8x
3623        jmp             .Loop_vpmadd52_8x
3624
3625.align  32
3626.Loop_vpmadd52_8x:
3627        #vpaddq         $T2,$H2,$H2             # accumulate input
3628        vpaddq          $T0,$H0,$H0
3629        vpaddq          $T1,$H1,$H1
3630
3631        vpxorq          $D0lo,$D0lo,$D0lo
3632        vpmadd52luq     $H2,$S1,$D0lo
3633        vpxorq          $D0hi,$D0hi,$D0hi
3634        vpmadd52huq     $H2,$S1,$D0hi
3635        vpxorq          $D1lo,$D1lo,$D1lo
3636        vpmadd52luq     $H2,$S2,$D1lo
3637        vpxorq          $D1hi,$D1hi,$D1hi
3638        vpmadd52huq     $H2,$S2,$D1hi
3639        vpxorq          $D2lo,$D2lo,$D2lo
3640        vpmadd52luq     $H2,$R0,$D2lo
3641        vpxorq          $D2hi,$D2hi,$D2hi
3642        vpmadd52huq     $H2,$R0,$D2hi
3643
3644         vmovdqu64      16*0($inp),$T2          # load data
3645         vmovdqu64      16*4($inp),$T3
3646         lea            16*8($inp),$inp
3647        vpmadd52luq     $H0,$R0,$D0lo
3648        vpmadd52huq     $H0,$R0,$D0hi
3649        vpmadd52luq     $H0,$R1,$D1lo
3650        vpmadd52huq     $H0,$R1,$D1hi
3651        vpmadd52luq     $H0,$R2,$D2lo
3652        vpmadd52huq     $H0,$R2,$D2hi
3653
3654         vpunpcklqdq    $T3,$T2,$T1             # transpose data
3655         vpunpckhqdq    $T3,$T2,$T3
3656        vpmadd52luq     $H1,$S2,$D0lo
3657        vpmadd52huq     $H1,$S2,$D0hi
3658        vpmadd52luq     $H1,$R0,$D1lo
3659        vpmadd52huq     $H1,$R0,$D1hi
3660        vpmadd52luq     $H1,$R1,$D2lo
3661        vpmadd52huq     $H1,$R1,$D2hi
3662
3663        ################################################################
3664        # partial reduction (interleaved with data splat)
3665        vpsrlq          \$44,$D0lo,$tmp
3666        vpsllq          \$8,$D0hi,$D0hi
3667        vpandq          $mask44,$D0lo,$H0
3668        vpaddq          $tmp,$D0hi,$D0hi
3669
3670         vpsrlq         \$24,$T3,$T2
3671         vporq          $PAD,$T2,$T2
3672        vpaddq          $D0hi,$D1lo,$D1lo
3673
3674        vpsrlq          \$44,$D1lo,$tmp
3675        vpsllq          \$8,$D1hi,$D1hi
3676        vpandq          $mask44,$D1lo,$H1
3677        vpaddq          $tmp,$D1hi,$D1hi
3678
3679         vpandq         $mask44,$T1,$T0
3680         vpsrlq         \$44,$T1,$T1
3681         vpsllq         \$20,$T3,$T3
3682        vpaddq          $D1hi,$D2lo,$D2lo
3683
3684        vpsrlq          \$42,$D2lo,$tmp
3685        vpsllq          \$10,$D2hi,$D2hi
3686        vpandq          $mask42,$D2lo,$H2
3687        vpaddq          $tmp,$D2hi,$D2hi
3688
3689          vpaddq        $T2,$H2,$H2             # accumulate input
3690        vpaddq          $D2hi,$H0,$H0
3691        vpsllq          \$2,$D2hi,$D2hi
3692
3693        vpaddq          $D2hi,$H0,$H0
3694         vporq          $T3,$T1,$T1
3695         vpandq         $mask44,$T1,$T1
3696
3697        vpsrlq          \$44,$H0,$tmp           # additional step
3698        vpandq          $mask44,$H0,$H0
3699
3700        vpaddq          $tmp,$H1,$H1
3701
3702        sub             \$8,$len                # len-=128
3703        jnz             .Loop_vpmadd52_8x
3704
3705.Ltail_vpmadd52_8x:
3706        #vpaddq         $T2,$H2,$H2             # accumulate input
3707        vpaddq          $T0,$H0,$H0
3708        vpaddq          $T1,$H1,$H1
3709
3710        vpxorq          $D0lo,$D0lo,$D0lo
3711        vpmadd52luq     $H2,$SS1,$D0lo
3712        vpxorq          $D0hi,$D0hi,$D0hi
3713        vpmadd52huq     $H2,$SS1,$D0hi
3714        vpxorq          $D1lo,$D1lo,$D1lo
3715        vpmadd52luq     $H2,$SS2,$D1lo
3716        vpxorq          $D1hi,$D1hi,$D1hi
3717        vpmadd52huq     $H2,$SS2,$D1hi
3718        vpxorq          $D2lo,$D2lo,$D2lo
3719        vpmadd52luq     $H2,$RR0,$D2lo
3720        vpxorq          $D2hi,$D2hi,$D2hi
3721        vpmadd52huq     $H2,$RR0,$D2hi
3722
3723        vpmadd52luq     $H0,$RR0,$D0lo
3724        vpmadd52huq     $H0,$RR0,$D0hi
3725        vpmadd52luq     $H0,$RR1,$D1lo
3726        vpmadd52huq     $H0,$RR1,$D1hi
3727        vpmadd52luq     $H0,$RR2,$D2lo
3728        vpmadd52huq     $H0,$RR2,$D2hi
3729
3730        vpmadd52luq     $H1,$SS2,$D0lo
3731        vpmadd52huq     $H1,$SS2,$D0hi
3732        vpmadd52luq     $H1,$RR0,$D1lo
3733        vpmadd52huq     $H1,$RR0,$D1hi
3734        vpmadd52luq     $H1,$RR1,$D2lo
3735        vpmadd52huq     $H1,$RR1,$D2hi
3736
3737        ################################################################
3738        # horizontal addition
3739
3740        mov             \$1,%eax
3741        kmovw           %eax,%k1
3742        vpsrldq         \$8,$D0lo,$T0
3743        vpsrldq         \$8,$D0hi,$H0
3744        vpsrldq         \$8,$D1lo,$T1
3745        vpsrldq         \$8,$D1hi,$H1
3746        vpaddq          $T0,$D0lo,$D0lo
3747        vpaddq          $H0,$D0hi,$D0hi
3748        vpsrldq         \$8,$D2lo,$T2
3749        vpsrldq         \$8,$D2hi,$H2
3750        vpaddq          $T1,$D1lo,$D1lo
3751        vpaddq          $H1,$D1hi,$D1hi
3752         vpermq         \$0x2,$D0lo,$T0
3753         vpermq         \$0x2,$D0hi,$H0
3754        vpaddq          $T2,$D2lo,$D2lo
3755        vpaddq          $H2,$D2hi,$D2hi
3756
3757        vpermq          \$0x2,$D1lo,$T1
3758        vpermq          \$0x2,$D1hi,$H1
3759        vpaddq          $T0,$D0lo,$D0lo
3760        vpaddq          $H0,$D0hi,$D0hi
3761        vpermq          \$0x2,$D2lo,$T2
3762        vpermq          \$0x2,$D2hi,$H2
3763        vpaddq          $T1,$D1lo,$D1lo
3764        vpaddq          $H1,$D1hi,$D1hi
3765         vextracti64x4  \$1,$D0lo,%y#$T0
3766         vextracti64x4  \$1,$D0hi,%y#$H0
3767        vpaddq          $T2,$D2lo,$D2lo
3768        vpaddq          $H2,$D2hi,$D2hi
3769
3770        vextracti64x4   \$1,$D1lo,%y#$T1
3771        vextracti64x4   \$1,$D1hi,%y#$H1
3772        vextracti64x4   \$1,$D2lo,%y#$T2
3773        vextracti64x4   \$1,$D2hi,%y#$H2
3774___
3775######## switch back to %ymm
3776map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3777map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3778map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3779
3780$code.=<<___;
3781        vpaddq          $T0,$D0lo,${D0lo}{%k1}{z}
3782        vpaddq          $H0,$D0hi,${D0hi}{%k1}{z}
3783        vpaddq          $T1,$D1lo,${D1lo}{%k1}{z}
3784        vpaddq          $H1,$D1hi,${D1hi}{%k1}{z}
3785        vpaddq          $T2,$D2lo,${D2lo}{%k1}{z}
3786        vpaddq          $H2,$D2hi,${D2hi}{%k1}{z}
3787
3788        ################################################################
3789        # partial reduction
3790        vpsrlq          \$44,$D0lo,$tmp
3791        vpsllq          \$8,$D0hi,$D0hi
3792        vpandq          $mask44,$D0lo,$H0
3793        vpaddq          $tmp,$D0hi,$D0hi
3794
3795        vpaddq          $D0hi,$D1lo,$D1lo
3796
3797        vpsrlq          \$44,$D1lo,$tmp
3798        vpsllq          \$8,$D1hi,$D1hi
3799        vpandq          $mask44,$D1lo,$H1
3800        vpaddq          $tmp,$D1hi,$D1hi
3801
3802        vpaddq          $D1hi,$D2lo,$D2lo
3803
3804        vpsrlq          \$42,$D2lo,$tmp
3805        vpsllq          \$10,$D2hi,$D2hi
3806        vpandq          $mask42,$D2lo,$H2
3807        vpaddq          $tmp,$D2hi,$D2hi
3808
3809        vpaddq          $D2hi,$H0,$H0
3810        vpsllq          \$2,$D2hi,$D2hi
3811
3812        vpaddq          $D2hi,$H0,$H0
3813
3814        vpsrlq          \$44,$H0,$tmp           # additional step
3815        vpandq          $mask44,$H0,$H0
3816
3817        vpaddq          $tmp,$H1,$H1
3818
3819        ################################################################
3820
3821        vmovq           %x#$H0,0($ctx)
3822        vmovq           %x#$H1,8($ctx)
3823        vmovq           %x#$H2,16($ctx)
3824        vzeroall
3825
3826.Lno_data_vpmadd52_8x:
3827        ret
3828.size   poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3829___
3830}
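# poly1305_emit_base2_44 converts the three base 2^44 limbs back to base
# 2^64, performs the final reduction modulo 2^130-5 and adds the nonce.
# Roughly, in C (an illustrative sketch only, with explicit carries):
#
#       h0 = h[0] + (h[1] << 44);                       # bits   0..63
#       h1 = (h[1] >> 20) + (h[2] << 24) + carry;       # bits  64..127
#       h2 = (h[2] >> 40) + carry;                      # bits 128..129
#       t  = h + 5;                                     # 130-bit compare
#       if (t >> 130) h = t;                            # pick h mod 2^130-5
#       mac = (h + nonce) mod 2^128;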
3831$code.=<<___;
3832.type   poly1305_emit_base2_44,\@function,3
3833.align  32
3834poly1305_emit_base2_44:
3835        mov     0($ctx),%r8     # load hash value
3836        mov     8($ctx),%r9
3837        mov     16($ctx),%r10
3838
3839        mov     %r9,%rax
3840        shr     \$20,%r9
3841        shl     \$44,%rax
3842        mov     %r10,%rcx
3843        shr     \$40,%r10
3844        shl     \$24,%rcx
3845
3846        add     %rax,%r8
3847        adc     %rcx,%r9
3848        adc     \$0,%r10
3849
3850        mov     %r8,%rax
3851        add     \$5,%r8         # compare to modulus
3852        mov     %r9,%rcx
3853        adc     \$0,%r9
3854        adc     \$0,%r10
3855        shr     \$2,%r10        # did 130-bit value overflow?
3856        cmovnz  %r8,%rax
3857        cmovnz  %r9,%rcx
3858
3859        add     0($nonce),%rax  # accumulate nonce
3860        adc     8($nonce),%rcx
3861        mov     %rax,0($mac)    # write result
3862        mov     %rcx,8($mac)
3863
3864        ret
3865.size   poly1305_emit_base2_44,.-poly1305_emit_base2_44
3866___
3867}       }       }
3868}
3869
3870if (!$kernel)
3871{       # chacha20-poly1305 helpers
3872my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
3873                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
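#
# Both helpers XOR the input with the pad at otp and leave the ciphertext in
# the pad buffer (presumably for subsequent authentication), zero-padding it
# to a 16-byte boundary. Roughly, for the encrypt direction (a sketch only,
# not a definition used elsewhere):
#
#       for (i = 0; i < len; i++) {
#               out[i] = inp[i] ^ otp[i];       # ciphertext out
#               otp[i] = out[i];                # and into the pad buffer
#       }
#       while (i & 15) otp[i++] = 0;            # pad to 16-byte multiple
#       return otp + i;
#
# The decrypt variant differs only in that the pad buffer receives the
# input (ciphertext) bytes rather than the XOR result.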
3874$code.=<<___;
3875.globl  xor128_encrypt_n_pad
3876.type   xor128_encrypt_n_pad,\@abi-omnipotent
3877.align  16
3878xor128_encrypt_n_pad:
3879        sub     $otp,$inp
3880        sub     $otp,$out
3881        mov     $len,%r10               # put len aside
3882        shr     \$4,$len                # len / 16
3883        jz      .Ltail_enc
3884        nop
3885.Loop_enc_xmm:
3886        movdqu  ($inp,$otp),%xmm0
3887        pxor    ($otp),%xmm0
3888        movdqu  %xmm0,($out,$otp)
3889        movdqa  %xmm0,($otp)
3890        lea     16($otp),$otp
3891        dec     $len
3892        jnz     .Loop_enc_xmm
3893
3894        and     \$15,%r10               # len % 16
3895        jz      .Ldone_enc
3896
3897.Ltail_enc:
3898        mov     \$16,$len
3899        sub     %r10,$len
3900        xor     %eax,%eax
3901.Loop_enc_byte:
3902        mov     ($inp,$otp),%al
3903        xor     ($otp),%al
3904        mov     %al,($out,$otp)
3905        mov     %al,($otp)
3906        lea     1($otp),$otp
3907        dec     %r10
3908        jnz     .Loop_enc_byte
3909
3910        xor     %eax,%eax
3911.Loop_enc_pad:
3912        mov     %al,($otp)
3913        lea     1($otp),$otp
3914        dec     $len
3915        jnz     .Loop_enc_pad
3916
3917.Ldone_enc:
3918        mov     $otp,%rax
3919        ret
3920.size   xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
3921
3922.globl  xor128_decrypt_n_pad
3923.type   xor128_decrypt_n_pad,\@abi-omnipotent
3924.align  16
3925xor128_decrypt_n_pad:
3926        sub     $otp,$inp
3927        sub     $otp,$out
3928        mov     $len,%r10               # put len aside
3929        shr     \$4,$len                # len / 16
3930        jz      .Ltail_dec
3931        nop
3932.Loop_dec_xmm:
3933        movdqu  ($inp,$otp),%xmm0
3934        movdqa  ($otp),%xmm1
3935        pxor    %xmm0,%xmm1
3936        movdqu  %xmm1,($out,$otp)
3937        movdqa  %xmm0,($otp)
3938        lea     16($otp),$otp
3939        dec     $len
3940        jnz     .Loop_dec_xmm
3941
3942        pxor    %xmm1,%xmm1
3943        and     \$15,%r10               # len % 16
3944        jz      .Ldone_dec
3945
3946.Ltail_dec:
3947        mov     \$16,$len
3948        sub     %r10,$len
3949        xor     %eax,%eax
3950        xor     %r11d,%r11d
3951.Loop_dec_byte:
3952        mov     ($inp,$otp),%r11b
3953        mov     ($otp),%al
3954        xor     %r11b,%al
3955        mov     %al,($out,$otp)
3956        mov     %r11b,($otp)
3957        lea     1($otp),$otp
3958        dec     %r10
3959        jnz     .Loop_dec_byte
3960
3961        xor     %eax,%eax
3962.Loop_dec_pad:
3963        mov     %al,($otp)
3964        lea     1($otp),$otp
3965        dec     $len
3966        jnz     .Loop_dec_pad
3967
3968.Ldone_dec:
3969        mov     $otp,%rax
3970        ret
3971.size   xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3972___
3973}
3974
3975# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3976#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
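#
# se_handler covers the integer code paths and restores the six callee-saved
# GPRs pushed in their prologues; avx_handler additionally copies the ten
# %xmm6-%xmm15 slots that the AVX/AVX2/AVX-512 prologues spill to the stack
# frame back into the CONTEXT record (a summary of the code below, not of
# any documented Windows API beyond RtlVirtualUnwind).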
3977if ($win64) {
3978$rec="%rcx";
3979$frame="%rdx";
3980$context="%r8";
3981$disp="%r9";
3982
3983$code.=<<___;
3984.extern __imp_RtlVirtualUnwind
3985.type   se_handler,\@abi-omnipotent
3986.align  16
3987se_handler:
3988        push    %rsi
3989        push    %rdi
3990        push    %rbx
3991        push    %rbp
3992        push    %r12
3993        push    %r13
3994        push    %r14
3995        push    %r15
3996        pushfq
3997        sub     \$64,%rsp
3998
3999        mov     120($context),%rax      # pull context->Rax
4000        mov     248($context),%rbx      # pull context->Rip
4001
4002        mov     8($disp),%rsi           # disp->ImageBase
4003        mov     56($disp),%r11          # disp->HandlerData
4004
4005        mov     0(%r11),%r10d           # HandlerData[0]
4006        lea     (%rsi,%r10),%r10        # prologue label
4007        cmp     %r10,%rbx               # context->Rip<.Lprologue
4008        jb      .Lcommon_seh_tail
4009
4010        mov     152($context),%rax      # pull context->Rsp
4011
4012        mov     4(%r11),%r10d           # HandlerData[1]
4013        lea     (%rsi,%r10),%r10        # epilogue label
4014        cmp     %r10,%rbx               # context->Rip>=.Lepilogue
4015        jae     .Lcommon_seh_tail
4016
4017        lea     48(%rax),%rax
4018
4019        mov     -8(%rax),%rbx
4020        mov     -16(%rax),%rbp
4021        mov     -24(%rax),%r12
4022        mov     -32(%rax),%r13
4023        mov     -40(%rax),%r14
4024        mov     -48(%rax),%r15
4025        mov     %rbx,144($context)      # restore context->Rbx
4026        mov     %rbp,160($context)      # restore context->Rbp
4027        mov     %r12,216($context)      # restore context->R12
4028        mov     %r13,224($context)      # restore context->R13
4029        mov     %r14,232($context)      # restore context->R14
4030        mov     %r15,240($context)      # restore context->R15
4031
4032        jmp     .Lcommon_seh_tail
4033.size   se_handler,.-se_handler
4034
4035.type   avx_handler,\@abi-omnipotent
4036.align  16
4037avx_handler:
4038        push    %rsi
4039        push    %rdi
4040        push    %rbx
4041        push    %rbp
4042        push    %r12
4043        push    %r13
4044        push    %r14
4045        push    %r15
4046        pushfq
4047        sub     \$64,%rsp
4048
4049        mov     120($context),%rax      # pull context->Rax
4050        mov     248($context),%rbx      # pull context->Rip
4051
4052        mov     8($disp),%rsi           # disp->ImageBase
4053        mov     56($disp),%r11          # disp->HandlerData
4054
4055        mov     0(%r11),%r10d           # HandlerData[0]
4056        lea     (%rsi,%r10),%r10        # prologue label
4057        cmp     %r10,%rbx               # context->Rip<prologue label
4058        jb      .Lcommon_seh_tail
4059
4060        mov     152($context),%rax      # pull context->Rsp
4061
4062        mov     4(%r11),%r10d           # HandlerData[1]
4063        lea     (%rsi,%r10),%r10        # epilogue label
4064        cmp     %r10,%rbx               # context->Rip>=epilogue label
4065        jae     .Lcommon_seh_tail
4066
4067        mov     208($context),%rax      # pull context->R11
4068
4069        lea     0x50(%rax),%rsi
4070        lea     0xf8(%rax),%rax
4071        lea     512($context),%rdi      # &context.Xmm6
4072        mov     \$20,%ecx
4073        .long   0xa548f3fc              # cld; rep movsq
4074
4075.Lcommon_seh_tail:
4076        mov     8(%rax),%rdi
4077        mov     16(%rax),%rsi
4078        mov     %rax,152($context)      # restore context->Rsp
4079        mov     %rsi,168($context)      # restore context->Rsi
4080        mov     %rdi,176($context)      # restore context->Rdi
4081
4082        mov     40($disp),%rdi          # disp->ContextRecord
4083        mov     $context,%rsi           # context
4084        mov     \$154,%ecx              # sizeof(CONTEXT)
4085        .long   0xa548f3fc              # cld; rep movsq
4086
4087        mov     $disp,%rsi
4088        xor     %ecx,%ecx               # arg1, UNW_FLAG_NHANDLER
4089        mov     8(%rsi),%rdx            # arg2, disp->ImageBase
4090        mov     0(%rsi),%r8             # arg3, disp->ControlPc
4091        mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
4092        mov     40(%rsi),%r10           # disp->ContextRecord
4093        lea     56(%rsi),%r11           # &disp->HandlerData
4094        lea     24(%rsi),%r12           # &disp->EstablisherFrame
4095        mov     %r10,32(%rsp)           # arg5
4096        mov     %r11,40(%rsp)           # arg6
4097        mov     %r12,48(%rsp)           # arg7
4098        mov     %rcx,56(%rsp)           # arg8, (NULL)
4099        call    *__imp_RtlVirtualUnwind(%rip)
4100
4101        mov     \$1,%eax                # ExceptionContinueSearch
4102        add     \$64,%rsp
4103        popfq
4104        pop     %r15
4105        pop     %r14
4106        pop     %r13
4107        pop     %r12
4108        pop     %rbp
4109        pop     %rbx
4110        pop     %rdi
4111        pop     %rsi
4112        ret
4113.size   avx_handler,.-avx_handler
4114
4115.section        .pdata
4116.align  4
4117        .rva    .LSEH_begin_poly1305_init_x86_64
4118        .rva    .LSEH_end_poly1305_init_x86_64
4119        .rva    .LSEH_info_poly1305_init_x86_64
4120
4121        .rva    .LSEH_begin_poly1305_blocks_x86_64
4122        .rva    .LSEH_end_poly1305_blocks_x86_64
4123        .rva    .LSEH_info_poly1305_blocks_x86_64
4124
4125        .rva    .LSEH_begin_poly1305_emit_x86_64
4126        .rva    .LSEH_end_poly1305_emit_x86_64
4127        .rva    .LSEH_info_poly1305_emit_x86_64
4128___
4129$code.=<<___ if ($avx);
4130        .rva    .LSEH_begin_poly1305_blocks_avx
4131        .rva    .Lbase2_64_avx
4132        .rva    .LSEH_info_poly1305_blocks_avx_1
4133
4134        .rva    .Lbase2_64_avx
4135        .rva    .Leven_avx
4136        .rva    .LSEH_info_poly1305_blocks_avx_2
4137
4138        .rva    .Leven_avx
4139        .rva    .LSEH_end_poly1305_blocks_avx
4140        .rva    .LSEH_info_poly1305_blocks_avx_3
4141
4142        .rva    .LSEH_begin_poly1305_emit_avx
4143        .rva    .LSEH_end_poly1305_emit_avx
4144        .rva    .LSEH_info_poly1305_emit_avx
4145___
4146$code.=<<___ if ($avx>1);
4147        .rva    .LSEH_begin_poly1305_blocks_avx2
4148        .rva    .Lbase2_64_avx2
4149        .rva    .LSEH_info_poly1305_blocks_avx2_1
4150
4151        .rva    .Lbase2_64_avx2
4152        .rva    .Leven_avx2
4153        .rva    .LSEH_info_poly1305_blocks_avx2_2
4154
4155        .rva    .Leven_avx2
4156        .rva    .LSEH_end_poly1305_blocks_avx2
4157        .rva    .LSEH_info_poly1305_blocks_avx2_3
4158___
4159$code.=<<___ if ($avx>2);
4160        .rva    .LSEH_begin_poly1305_blocks_avx512
4161        .rva    .LSEH_end_poly1305_blocks_avx512
4162        .rva    .LSEH_info_poly1305_blocks_avx512
4163___
4164$code.=<<___;
4165.section        .xdata
4166.align  8
4167.LSEH_info_poly1305_init_x86_64:
4168        .byte   9,0,0,0
4169        .rva    se_handler
4170        .rva    .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
4171
4172.LSEH_info_poly1305_blocks_x86_64:
4173        .byte   9,0,0,0
4174        .rva    se_handler
4175        .rva    .Lblocks_body,.Lblocks_epilogue
4176
4177.LSEH_info_poly1305_emit_x86_64:
4178        .byte   9,0,0,0
4179        .rva    se_handler
4180        .rva    .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
4181___
4182$code.=<<___ if ($avx);
4183.LSEH_info_poly1305_blocks_avx_1:
4184        .byte   9,0,0,0
4185        .rva    se_handler
4186        .rva    .Lblocks_avx_body,.Lblocks_avx_epilogue         # HandlerData[]
4187
4188.LSEH_info_poly1305_blocks_avx_2:
4189        .byte   9,0,0,0
4190        .rva    se_handler
4191        .rva    .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue     # HandlerData[]
4192
4193.LSEH_info_poly1305_blocks_avx_3:
4194        .byte   9,0,0,0
4195        .rva    avx_handler
4196        .rva    .Ldo_avx_body,.Ldo_avx_epilogue                 # HandlerData[]
4197
4198.LSEH_info_poly1305_emit_avx:
4199        .byte   9,0,0,0
4200        .rva    se_handler
4201        .rva    .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
4202___
4203$code.=<<___ if ($avx>1);
4204.LSEH_info_poly1305_blocks_avx2_1:
4205        .byte   9,0,0,0
4206        .rva    se_handler
4207        .rva    .Lblocks_avx2_body,.Lblocks_avx2_epilogue       # HandlerData[]
4208
4209.LSEH_info_poly1305_blocks_avx2_2:
4210        .byte   9,0,0,0
4211        .rva    se_handler
4212        .rva    .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue   # HandlerData[]
4213
4214.LSEH_info_poly1305_blocks_avx2_3:
4215        .byte   9,0,0,0
4216        .rva    avx_handler
4217        .rva    .Ldo_avx2_body,.Ldo_avx2_epilogue               # HandlerData[]
4218___
4219$code.=<<___ if ($avx>2);
4220.LSEH_info_poly1305_blocks_avx512:
4221        .byte   9,0,0,0
4222        .rva    avx_handler
4223        .rva    .Ldo_avx512_body,.Ldo_avx512_epilogue           # HandlerData[]
4224___
4225}
4226
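# Reproduce this script's comment header (SPDX tag and licence text) at the
# top of the output, with the leading "#" of each comment line turned into "//".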
4227open SELF,$0;
4228while(<SELF>) {
4229        next if (/^#!/);
4230        last if (!s/^#/\/\// and !/^$/);
4231        print;
4232}
4233close SELF;
4234
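# Post-process the generated code: evaluate `...` arithmetic, expand the
# "#d" register-suffix shorthand (%rax#d -> %eax, %r10#d -> %r10d) and the
# "%x#"/"%y#"/"%z#" SIMD-width overrides, and for the kernel build drop CFI
# directives and rewrite the .type annotations.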
4235foreach (split('\n',$code)) {
4236        s/\`([^\`]*)\`/eval($1)/ge;
4237        s/%r([a-z]+)#d/%e$1/g;
4238        s/%r([0-9]+)#d/%r$1d/g;
4239        s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
4240
4241        if ($kernel) {
4242                s/(^\.type.*),[0-9]+$/\1/;
4243                s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
4244                next if /^\.cfi.*/;
4245        }
4246
4247        print $_,"\n";
4248}
4249close STDOUT;
4250