|
.p2align 4

/*
 * 128-bit constants applied to both 64-bit lanes of an SSE register.
 * Each is written as two little-endian quadwords (low lane first).
 */

/* AND mask: keeps the low 26 bits (0x3ffffff) of each lane. */
ANMASK:
	.quad	0x0000000003ffffff, 0x0000000003ffffff

/* OR mask: sets bit 24 (1 << 24) in each lane. */
ORMASK:
	.quad	0x0000000001000000, 0x0000000001000000

.text
|
20 | 21 |
|
@@ -274,3 +275,308 @@ ENTRY(poly1305_block_sse2)
|
274 | 275 | pop %rbx
|
275 | 276 | ret
|
276 | 277 | ENDPROC(poly1305_block_sse2)
|
| 278 | + |
| 279 | + |
/*
 * Aliases used by poly1305_2block_sse2.
 *
 * u0..u4   the five 32-bit words at %r8 (documented by the routine as the
 *          derived key r^2)
 * hc0..hc4 per-limb working registers, one 64-bit lane per block
 * ru0..ru4 per-limb key registers combining an r word and a u word
 * sv1..sv4 ru1..ru4 multiplied by 5
 *
 * NOTE(review): %xmm3 and %xmm4 are skipped here, presumably because the
 * t1/t2 scratch aliases defined elsewhere in this file use them - confirm.
 */

/* Memory operands: u[0..4] relative to %r8. */
#define u0	0x00(%r8)
#define u1	0x04(%r8)
#define u2	0x08(%r8)
#define u3	0x0c(%r8)
#define u4	0x10(%r8)

/* SSE working registers. */
#define hc0	%xmm0
#define hc1	%xmm1
#define hc2	%xmm2
#define hc3	%xmm5
#define hc4	%xmm6
#define ru0	%xmm7
#define ru1	%xmm8
#define ru2	%xmm9
#define ru3	%xmm10
#define ru4	%xmm11
#define sv1	%xmm12
#define sv2	%xmm13
#define sv3	%xmm14
#define sv4	%xmm15

/*
 * Drop whatever earlier definition d0 had and rebind it to %r13; %r8 is
 * taken by the u[] pointer above.
 */
#undef d0
#define d0	%r13
| 301 | + |
ENTRY(poly1305_2block_sse2)
	# %rdi: Accumulator h[5]
	# %rsi: Input m, 32 bytes (two 16 byte blocks) read per doubleblock
	# %rdx: Poly1305 key r[5]
	# %rcx: Doubleblock count, must be >= 1 (the loop tests at the bottom)
	# %r8:  Poly1305 derived key r^2 u[5]
	#
	# h is updated in place; m and %rcx are consumed. %rbx, %r12 and %r13
	# are saved and restored. %rax, the flags and the SSE registers named
	# by the hc*, ru*, sv*, t1 and t2 aliases are clobbered.

	# This two-block variant further improves performance by using loop
	# unrolled block processing. This is more straight forward and does
	# less byte shuffling, but requires a second Poly1305 key r^2:
	# h = (h + m) * r    =>    h = (h + m1) * r^2 + m2 * r
	#
	# Lane layout used throughout, written [ high qword, low qword ]:
	# the low lane carries (h + m1) and is multiplied by u = r^2, the
	# high lane carries m2 and is multiplied by r. The two lanes are
	# summed when each d limb is extracted.

	push		%rbx
	push		%r12
	push		%r13

	# combine r0,u0
	movd		u0,ru0
	movd		r0,t1
	punpcklqdq	t1,ru0

	# combine r1,u1 and s1=r1*5,v1=u1*5
	movd		u1,ru1
	movd		r1,t1
	punpcklqdq	t1,ru1
	movdqa		ru1,sv1
	pslld		$2,sv1
	paddd		ru1,sv1

	# combine r2,u2 and s2=r2*5,v2=u2*5
	movd		u2,ru2
	movd		r2,t1
	punpcklqdq	t1,ru2
	movdqa		ru2,sv2
	pslld		$2,sv2
	paddd		ru2,sv2

	# combine r3,u3 and s3=r3*5,v3=u3*5
	movd		u3,ru3
	movd		r3,t1
	punpcklqdq	t1,ru3
	movdqa		ru3,sv3
	pslld		$2,sv3
	paddd		ru3,sv3

	# combine r4,u4 and s4=r4*5,v4=u4*5
	movd		u4,ru4
	movd		r4,t1
	punpcklqdq	t1,ru4
	movdqa		ru4,sv4
	pslld		$2,sv4
	paddd		ru4,sv4

.Ldoblock2:
	# Split both 16 byte blocks into 26 bit limbs. The current h limb is
	# added to the low lane only (movd zeroes the high lane of t1).

	# hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
	movd		0x00(m),hc0
	movd		0x10(m),t1
	punpcklqdq	t1,hc0
	pand		ANMASK(%rip),hc0
	movd		h0,t1
	paddd		t1,hc0
	# hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
	movd		0x03(m),hc1
	movd		0x13(m),t1
	punpcklqdq	t1,hc1
	psrld		$2,hc1
	pand		ANMASK(%rip),hc1
	movd		h1,t1
	paddd		t1,hc1
	# hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
	movd		0x06(m),hc2
	movd		0x16(m),t1
	punpcklqdq	t1,hc2
	psrld		$4,hc2
	pand		ANMASK(%rip),hc2
	movd		h2,t1
	paddd		t1,hc2
	# hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
	movd		0x09(m),hc3
	movd		0x19(m),t1
	punpcklqdq	t1,hc3
	psrld		$6,hc3
	pand		ANMASK(%rip),hc3
	movd		h3,t1
	paddd		t1,hc3
	# hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
	# Bit 24 of limb 4 is bit 4*26 + 24 = 128 of the block value.
	movd		0x0c(m),hc4
	movd		0x1c(m),t1
	punpcklqdq	t1,hc4
	psrld		$8,hc4
	por		ORMASK(%rip),hc4
	movd		h4,t1
	paddd		t1,hc4

	# Multiply. pmuludq forms one 32x32->64 bit product per lane, so
	# every d limb below is a full 64 bit sum of ten products.

	# t1 = [ hc0[1] * r0, hc0[0] * u0 ]
	movdqa		ru0,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * s4, hc1[0] * v4 ]
	movdqa		sv4,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * s3, hc2[0] * v3 ]
	movdqa		sv3,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * s2, hc3[0] * v2 ]
	movdqa		sv2,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * s1, hc4[0] * v1 ]
	movdqa		sv1,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d0 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d0

	# t1 = [ hc0[1] * r1, hc0[0] * u1 ]
	movdqa		ru1,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * r0, hc1[0] * u0 ]
	movdqa		ru0,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * s4, hc2[0] * v4 ]
	movdqa		sv4,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * s3, hc3[0] * v3 ]
	movdqa		sv3,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * s2, hc4[0] * v2 ]
	movdqa		sv2,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d1 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d1

	# t1 = [ hc0[1] * r2, hc0[0] * u2 ]
	movdqa		ru2,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * r1, hc1[0] * u1 ]
	movdqa		ru1,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * r0, hc2[0] * u0 ]
	movdqa		ru0,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * s4, hc3[0] * v4 ]
	movdqa		sv4,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * s3, hc4[0] * v3 ]
	movdqa		sv3,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d2 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d2

	# t1 = [ hc0[1] * r3, hc0[0] * u3 ]
	movdqa		ru3,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * r2, hc1[0] * u2 ]
	movdqa		ru2,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * r1, hc2[0] * u1 ]
	movdqa		ru1,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * r0, hc3[0] * u0 ]
	movdqa		ru0,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * s4, hc4[0] * v4 ]
	movdqa		sv4,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d3 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d3

	# t1 = [ hc0[1] * r4, hc0[0] * u4 ]
	movdqa		ru4,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * r3, hc1[0] * u3 ]
	movdqa		ru3,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * r2, hc2[0] * u2 ]
	movdqa		ru2,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * r1, hc3[0] * u1 ]
	movdqa		ru1,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * r0, hc4[0] * u0 ]
	movdqa		ru0,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d4 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d4

	# Partial reduction: propagate carries d0 -> d1 -> ... -> d4, fold
	# the carry out of d4 back into h0 times 5, then carry once more
	# from h0 into h1. h0 is kept in %rbx until the very end.

	# d1 += d0 >> 26
	mov		d0,%rax
	shr		$26,%rax
	add		%rax,d1
	# h0 = d0 & 0x3ffffff
	mov		d0,%rbx
	and		$0x3ffffff,%ebx

	# d2 += d1 >> 26
	mov		d1,%rax
	shr		$26,%rax
	add		%rax,d2
	# h1 = d1 & 0x3ffffff
	mov		d1,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h1

	# d3 += d2 >> 26
	mov		d2,%rax
	shr		$26,%rax
	add		%rax,d3
	# h2 = d2 & 0x3ffffff
	mov		d2,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h2

	# d4 += d3 >> 26
	mov		d3,%rax
	shr		$26,%rax
	add		%rax,d4
	# h3 = d3 & 0x3ffffff
	mov		d3,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h3

	# h0 += (d4 >> 26) * 5
	# Kept in 64 bit registers: with two blocks accumulated into d4,
	# (d4 >> 26) * 5 is not guaranteed to fit in 32 bits, and a 32 bit
	# lea/add would silently drop the excess carry.
	mov		d4,%rax
	shr		$26,%rax
	lea		(%rax,%rax,4),%rax
	add		%rax,%rbx
	# h4 = d4 & 0x3ffffff
	mov		d4,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h4

	# h1 += h0 >> 26
	# h0 is a 64 bit value in %rbx at this point, so shift the whole
	# register before narrowing to the 32 bit limb.
	mov		%rbx,%rax
	shr		$26,%rax
	add		%eax,h1
	# h0 = h0 & 0x3ffffff
	andl		$0x3ffffff,%ebx
	mov		%ebx,h0

	# Advance past the two blocks just consumed.
	add		$0x20,m
	dec		%rcx
	jnz		.Ldoblock2

	pop		%r13
	pop		%r12
	pop		%rbx
	ret
ENDPROC(poly1305_2block_sse2)
0 commit comments