41
41
* /
42
42
.text
43
43
ENTRY(sha3_ce_transform)
44
- / * load state * /
45
- add x8 , x0 , # 32
46
- ld1 { v0.1d - v3.1d} , [ x0 ]
44
+ frame_push 4
45
+
46
+ mov x19 , x0
47
+ mov x20 , x1
48
+ mov x21 , x2
49
+ mov x22 , x3
50
+
51
+ 0 : / * load state * /
52
+ add x8 , x19 , # 32
53
+ ld1 { v0.1d - v3.1d} , [ x19 ]
47
54
ld1 { v4.1d - v7.1d} , [ x8 ], # 32
48
55
ld1 { v8.1d - v11.1d} , [ x8 ], # 32
49
56
ld1 {v12.1d - v15.1d} , [ x8 ], # 32
50
57
ld1 {v16.1d - v19.1d} , [ x8 ], # 32
51
58
ld1 {v20.1d - v23.1d} , [ x8 ], # 32
52
59
ld1 {v24.1d} , [ x8 ]
53
60
54
- 0 : sub w2 , w2 , # 1
61
+ 1 : sub w21 , w21 , # 1
55
62
mov w8 , # 24
56
63
adr_l x9 , .Lsha3_rcon
57
64
58
65
/ * load input * /
59
- ld1 {v25.8b - v28.8b} , [ x1 ], # 32
60
- ld1 {v29.8b - v31.8b} , [ x1 ], # 24
66
+ ld1 {v25.8b - v28.8b} , [ x20 ], # 32
67
+ ld1 {v29.8b - v31.8b} , [ x20 ], # 24
61
68
eor v0.8b , v0.8b , v25.8b
62
69
eor v1.8b , v1.8b , v26.8b
63
70
eor v2.8b , v2.8b , v27.8b
@@ -66,45 +73,45 @@ ENTRY(sha3_ce_transform)
66
73
eor v5.8b , v5.8b , v30.8b
67
74
eor v6.8b , v6.8b , v31.8b
68
75
69
- tbnz x3 , # 6 , 2f // SHA3 - 512
76
+ tbnz x22 , # 6 , 3f // SHA3 - 512
70
77
71
- ld1 {v25.8b - v28.8b} , [ x1 ], # 32
72
- ld1 {v29.8b - v30.8b} , [ x1 ], # 16
78
+ ld1 {v25.8b - v28.8b} , [ x20 ], # 32
79
+ ld1 {v29.8b - v30.8b} , [ x20 ], # 16
73
80
eor v7.8b , v7.8b , v25.8b
74
81
eor v8.8b , v8.8b , v26.8b
75
82
eor v9.8b , v9.8b , v27.8b
76
83
eor v10.8b , v10.8b , v28.8b
77
84
eor v11.8b , v11.8b , v29.8b
78
85
eor v12.8b , v12.8b , v30.8b
79
86
80
- tbnz x3 , # 4 , 1f // SHA3 - 384 or SHA3 - 224
87
+ tbnz x22 , # 4 , 2f // SHA3 - 384 or SHA3 - 224
81
88
82
89
// SHA3 - 256
83
- ld1 {v25.8b - v28.8b} , [ x1 ], # 32
90
+ ld1 {v25.8b - v28.8b} , [ x20 ], # 32
84
91
eor v13.8b , v13.8b , v25.8b
85
92
eor v14.8b , v14.8b , v26.8b
86
93
eor v15.8b , v15.8b , v27.8b
87
94
eor v16.8b , v16.8b , v28.8b
88
- b 3f
95
+ b 4f
89
96
90
- 1 : tbz x3 , # 2 , 3f // bit 2 cleared? SHA - 384
97
+ 2 : tbz x22 , # 2 , 4f // bit 2 cleared? SHA - 384
91
98
92
99
// SHA3 - 224
93
- ld1 {v25.8b - v28.8b} , [ x1 ], # 32
94
- ld1 {v29.8b} , [ x1 ], # 8
100
+ ld1 {v25.8b - v28.8b} , [ x20 ], # 32
101
+ ld1 {v29.8b} , [ x20 ], # 8
95
102
eor v13.8b , v13.8b , v25.8b
96
103
eor v14.8b , v14.8b , v26.8b
97
104
eor v15.8b , v15.8b , v27.8b
98
105
eor v16.8b , v16.8b , v28.8b
99
106
eor v17.8b , v17.8b , v29.8b
100
- b 3f
107
+ b 4f
101
108
102
109
// SHA3 - 512
103
- 2 : ld1 {v25.8b - v26.8b} , [ x1 ], # 16
110
+ 3 : ld1 {v25.8b - v26.8b} , [ x20 ], # 16
104
111
eor v7.8b , v7.8b , v25.8b
105
112
eor v8.8b , v8.8b , v26.8b
106
113
107
- 3 : sub w8 , w8 , # 1
114
+ 4 : sub w8 , w8 , # 1
108
115
109
116
eor3 v29.16b , v4.16b , v9.16b , v14.16b
110
117
eor3 v26.16b , v1.16b , v6.16b , v11.16b
@@ -183,17 +190,33 @@ ENTRY(sha3_ce_transform)
183
190
184
191
eor v0.16b , v0.16b , v31.16b
185
192
186
- cbnz w8 , 3b
187
- cbnz w2 , 0b
193
+ cbnz w8 , 4b
194
+ cbz w21 , 5f
195
+
196
+ if_will_cond_yield_neon
197
+ add x8 , x19 , # 32
198
+ st1 { v0.1d - v3.1d} , [ x19 ]
199
+ st1 { v4.1d - v7.1d} , [ x8 ], # 32
200
+ st1 { v8.1d - v11.1d} , [ x8 ], # 32
201
+ st1 {v12.1d - v15.1d} , [ x8 ], # 32
202
+ st1 {v16.1d - v19.1d} , [ x8 ], # 32
203
+ st1 {v20.1d - v23.1d} , [ x8 ], # 32
204
+ st1 {v24.1d} , [ x8 ]
205
+ do_cond_yield_neon
206
+ b 0b
207
+ endif_yield_neon
208
+
209
+ b 1b
188
210
189
211
/ * save state * /
190
- st1 { v0.1d - v3.1d} , [ x0 ], # 32
191
- st1 { v4.1d - v7.1d} , [ x0 ], # 32
192
- st1 { v8.1d - v11.1d} , [ x0 ], # 32
193
- st1 {v12.1d - v15.1d} , [ x0 ], # 32
194
- st1 {v16.1d - v19.1d} , [ x0 ], # 32
195
- st1 {v20.1d - v23.1d} , [ x0 ], # 32
196
- st1 {v24.1d} , [ x0 ]
212
+ 5 : st1 { v0.1d - v3.1d} , [ x19 ], # 32
213
+ st1 { v4.1d - v7.1d} , [ x19 ], # 32
214
+ st1 { v8.1d - v11.1d} , [ x19 ], # 32
215
+ st1 {v12.1d - v15.1d} , [ x19 ], # 32
216
+ st1 {v16.1d - v19.1d} , [ x19 ], # 32
217
+ st1 {v20.1d - v23.1d} , [ x19 ], # 32
218
+ st1 {v24.1d} , [ x19 ]
219
+ frame_pop
197
220
ret
198
221
ENDPROC(sha3_ce_transform)
199
222
0 commit comments