@@ -183,11 +183,31 @@ GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
183
183
// shiftD1Right rotates the 128-bit register X10 right by 4 bytes (one 32-bit
// lane). It is the hand-assembled encoding of PALIGNR $4, X10, X10:
// 66 (prefix), 45 (REX.R+REX.B, selecting X8-X15 for both operands),
// 0F 3A 0F (PALIGNR opcode), D2 (ModRM: reg=2, rm=2 -> X10, X10), 04 (imm8).
#define shiftD1Right BYTE $0x66 ; BYTE $0x45 ; BYTE $0x0f ; BYTE $0x3a ; BYTE $0x0f ; BYTE $0xd2 ; BYTE $0x04 // PALIGNR $4, X10, X10
184
184
// shiftD2Right rotates the 128-bit register X11 right by 4 bytes (one 32-bit
// lane). It is the hand-assembled encoding of PALIGNR $4, X11, X11; the ModRM
// byte DB (reg=3, rm=3) combined with REX 45 selects X11 for both operands.
#define shiftD2Right BYTE $0x66 ; BYTE $0x45 ; BYTE $0x0f ; BYTE $0x3a ; BYTE $0x0f ; BYTE $0xdb ; BYTE $0x04 // PALIGNR $4, X11, X11
185
185
// shiftD3Right rotates the 128-bit register X15 right by 4 bytes (one 32-bit
// lane). It is the hand-assembled encoding of PALIGNR $4, X15, X15; the ModRM
// byte FF (reg=7, rm=7) combined with REX 45 selects X15 for both operands.
#define shiftD3Right BYTE $0x66 ; BYTE $0x45 ; BYTE $0x0f ; BYTE $0x3a ; BYTE $0x0f ; BYTE $0xff ; BYTE $0x04 // PALIGNR $4, X15, X15
186
+
186
187
// Some macros
188
+
189
+ // ROL rotates each uint32 lane of register R left by N bits, clobbering the
+ // temporary register T. N must be a constant (it is used in immediates) and
+ // R and T must be different registers. The '(' must directly follow the macro
+ // name: with a space in between, the assembler's preprocessor defines an
+ // argument-less macro instead of a three-parameter one.
190
+ #define ROL(N, R, T) \
191
+ MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R
192
+
193
+ // ROL16 rotates every uint32 lane of R left by 16 bits. Without GOAMD64_v2
+ // it falls back to the generic shift-based ROL, which clobbers T; with
+ // GOAMD64_v2 it is a single PSHUFB against the ·rol16<> table (defined
+ // outside this excerpt) and T is left untouched.
194
+ #ifndef GOAMD64_v2
195
+ #define ROL16(R, T) ROL(16, R, T)
196
+ #else
197
+ #define ROL16(R, T) PSHUFB ·rol16<>(SB), R
198
+ #endif
199
+
200
+ // ROL8 rotates every uint32 lane of R left by 8 bits. Without GOAMD64_v2
+ // it falls back to the generic shift-based ROL, which clobbers T; with
+ // GOAMD64_v2 it is a single PSHUFB against the ·rol8<> table (defined
+ // outside this excerpt) and T is left untouched.
201
+ #ifndef GOAMD64_v2
202
+ #define ROL8(R, T) ROL(8, R, T)
203
+ #else
204
+ #define ROL8(R, T) PSHUFB ·rol8<>(SB), R
205
+ #endif
206
+
187
207
// chachaQR performs one ChaCha quarter round on the 32-bit lanes of the
// registers A, B, C, D, using T as a scratch register (T is overwritten):
//   A += B; D ^= A; D <<<= 16
//   C += D; B ^= C; B <<<= 12
//   A += B; D ^= A; D <<<= 8
//   C += D; B ^= C; B <<<= 7
// The 12- and 7-bit rotations are built from a left shift into T, a right
// shift of B by (32 - n), and a PXOR to merge the two halves.
#define chachaQR(A, B, C, D, T) \
188
- PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \
208
+ PADDD B, A; PXOR A, D; ROL16(D, T) \
189
209
PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12 , T; PSRLL $20 , B; PXOR T, B \
190
- PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \
210
+ PADDD B, A; PXOR A, D; ROL8(D, T) \
191
211
PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7 , T; PSRLL $25 , B; PXOR T, B
192
212
193
213
#define chachaQR_AVX2(A, B, C, D, T) \
0 commit comments