@@ -132,24 +132,24 @@ Note that 14 and 12 nm Ryzen chips can only do 1 full width `fma` per clock cycl
132
132
133
133
We can also vectorize fancier loops. A familiar example to dive into is matrix multiplication:
134
134
``` julia
135
- julia> function mygemm! (𝐂, 𝐀, 𝐁 )
136
- @inbounds @fastmath for m ∈ 1 : size (𝐀 ,1 ), n ∈ 1 : size (𝐁 ,2 )
137
- 𝐂ₘₙ = zero (eltype (𝐂 ))
138
- for k ∈ 1 : size (𝐀 ,2 )
139
- 𝐂ₘₙ += 𝐀 [m,k] * 𝐁 [k,n]
135
+ julia> function mygemm! (C, A, B )
136
+ @inbounds @fastmath for m ∈ 1 : size (A ,1 ), n ∈ 1 : size (B ,2 )
137
+ Cmn = zero (eltype (C ))
138
+ for k ∈ 1 : size (A ,2 )
139
+ Cmn += A [m,k] * B [k,n]
140
140
end
141
- 𝐂 [m,n] = 𝐂ₘₙ
141
+ C [m,n] = Cmn
142
142
end
143
143
end
144
144
mygemm! (generic function with 1 method)
145
145
146
- julia> function mygemmavx! (𝐂, 𝐀, 𝐁 )
147
- @avx for m ∈ 1 : size (𝐀 ,1 ), n ∈ 1 : size (𝐁 ,2 )
148
- 𝐂ₘₙ = zero (eltype (𝐂 ))
149
- for k ∈ 1 : size (𝐀 ,2 )
150
- 𝐂ₘₙ += 𝐀 [m,k] * 𝐁 [k,n]
146
+ julia> function mygemmavx! (C, A, B )
147
+ @avx for m ∈ 1 : size (A ,1 ), n ∈ 1 : size (B ,2 )
148
+ Cmn = zero (eltype (C ))
149
+ for k ∈ 1 : size (A ,2 )
150
+ Cmn += A [m,k] * B [k,n]
151
151
end
152
- 𝐂 [m,n] = 𝐂ₘₙ
152
+ C [m,n] = Cmn
153
153
end
154
154
end
155
155
mygemmavx! (generic function with 1 method)
@@ -276,24 +276,22 @@ BLAS.set_num_threads(1); @show BLAS.vendor()
276
276
const MatrixFInt64 = Union{Matrix{Float64}, Matrix{Int}}
277
277
278
278
function mul_avx! (C:: MatrixFInt64 , A:: MatrixFInt64 , B:: MatrixFInt64 )
279
- z = zero (eltype (C))
280
- @avx for i ∈ 1 : size (A,1 ), j ∈ 1 : size (B,2 )
281
- Cᵢⱼ = z
279
+ @avx for m ∈ 1 : size (A,1 ), n ∈ 1 : size (B,2 )
280
+ Cmn = zero (eltype (C))
282
281
for k ∈ 1 : size (A,2 )
283
- Cᵢⱼ += A[i ,k] * B[k,j ]
282
+ Cmn += A[m ,k] * B[k,n ]
284
283
end
285
- C[i,j ] = Cᵢⱼ
284
+ C[m,n ] = Cmn
286
285
end
287
286
end
288
287
289
288
function mul_add_avx! (C:: MatrixFInt64 , A:: MatrixFInt64 , B:: MatrixFInt64 , factor= 1 )
290
- z = zero (eltype (C))
291
- @avx for i ∈ 1 : size (A,1 ), j ∈ 1 : size (B,2 )
292
- ΔCᵢⱼ = z
289
+ @avx for m ∈ 1 : size (A,1 ), n ∈ 1 : size (B,2 )
290
+ ΔCmn = zero (eltype (C))
293
291
for k ∈ 1 : size (A,2 )
294
- ΔCᵢⱼ += A[i ,k] * B[k,j ]
292
+ ΔCmn += A[m ,k] * B[k,n ]
295
293
end
296
- C[i,j ] += factor * ΔCᵢⱼ
294
+ C[m,n ] += factor * ΔCmn
297
295
end
298
296
end
299
297
0 commit comments