@@ -3206,3 +3206,104 @@ TEST(VulkanComputeGraphOpsTest, test_transpose_with_mm) {
3206
3206
test_transpose_view_mm (2 , 7 , 17 , 5 , storage_type);
3207
3207
}
3208
3208
}
3209
+
3210
+ void test_to_copy () {
3211
+ GraphConfig config;
3212
+ config.set_storage_type_override (utils::kTexture3D );
3213
+ ComputeGraph graph (config);
3214
+ int M = 8 ;
3215
+ int N = 8 ;
3216
+ int K = 8 ;
3217
+ // Build graph
3218
+ IOValueRef in = graph.add_input_tensor (
3219
+ {1 , M, N, K},
3220
+ vkapi::kFloat ,
3221
+ utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
3222
+
3223
+ std::vector<float > data_in =
3224
+ create_random_float_buffer (M * N * K, -1024 , 1024 );
3225
+ graph.copy_into_staging (in.staging , data_in.data (), data_in.size ());
3226
+
3227
+ IOValueRef out;
3228
+ out.value = graph.add_tensor (
3229
+ {1 , M, N, K},
3230
+ vkapi::kHalf ,
3231
+ utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
3232
+
3233
+ auto op = VK_GET_OP_FN (" aten._to_copy.default" );
3234
+ op (graph,
3235
+ {in.value ,
3236
+ graph.add_none (),
3237
+ graph.add_none (),
3238
+ graph.add_none (),
3239
+ graph.add_none (),
3240
+ graph.add_none (),
3241
+ graph.add_none (),
3242
+ out.value });
3243
+
3244
+ out.staging = graph.set_output_tensor (out.value );
3245
+
3246
+ graph.prepare ();
3247
+ graph.encode_prepack ();
3248
+ graph.prepack ();
3249
+ graph.encode_execute ();
3250
+ graph.propagate_resize ();
3251
+ graph.execute ();
3252
+
3253
+ std::vector<torch::executor::Half> output_data (graph.numel_of (out.value ));
3254
+ graph.copy_from_staging (out.staging , output_data.data (), output_data.size ());
3255
+
3256
+ EXPECT_EQ (data_in.size (), output_data.size ());
3257
+
3258
+ float mse_ex = 0 .0f ;
3259
+ float mse_vk = 0 .0f ;
3260
+
3261
+ // check results
3262
+ for (size_t i = 0 ; i < output_data.size (); ++i) {
3263
+ float input = data_in[i];
3264
+ torch::executor::Half expected_output =
3265
+ static_cast <torch::executor::Half>(input);
3266
+ uint16_t * expected_bits = reinterpret_cast <uint16_t *>(&expected_output);
3267
+ torch::executor::Half output = output_data[i];
3268
+ uint16_t * output_bits = reinterpret_cast <uint16_t *>(&output);
3269
+
3270
+ std::cout << " input = " << input << " (0b"
3271
+ << std::bitset<32 >(*reinterpret_cast <uint32_t *>(&input))
3272
+ << " ), expected output = " << expected_output << " (0b"
3273
+ << std::bitset<16 >(*expected_bits)
3274
+ << " ), recieved output = " << output << " (0b"
3275
+ << std::bitset<16 >(*output_bits) << " )" << std::endl;
3276
+
3277
+ // Note: Torch executor half "rounds up" when converting to fp16 whereas
3278
+ // most driver implementations of Vulkan's opFConvert() just truncates the
3279
+ // extra bits for performance (rounding introduces conditional).
3280
+ // Example:
3281
+ // INPUT F32 = 25.248 (sign{0b0}, exp{0b10000011},
3282
+ // mantissa{0b10010011111101111100111}),
3283
+ // TORCH HALF OUTPUT F16 = 25.25 (sign{0b0}, exp{0b10011},
3284
+ // mantissa{0b1001010000}),
3285
+ // VULKAN OUTPUT F16 = 25.2344 (sign{0b0}, exp{0b10011},
3286
+ // mantissa{0b1001001111})
3287
+ // Note:
3288
+ // The vulkan mantissa exactly matches the first 10
3289
+ // bits of the input 23 bit mantissa. But since the 11th bit is 1, the
3290
+ // torch half output is rounded up (essentially adding a 1).
3291
+ // Vulkan mantissa{0b1001001111} + 1 = Torch half mantissa{0b1001010000}
3292
+
3293
+ EXPECT_TRUE (
3294
+ (*output_bits == *expected_bits) ||
3295
+ /* rounding error*/ ((*output_bits + 1u ) == *expected_bits));
3296
+ mse_ex += std::pow (expected_output - input, 2 );
3297
+ mse_vk += std::pow (output - input, 2 );
3298
+ }
3299
+
3300
+ mse_ex /= output_data.size ();
3301
+ mse_vk /= output_data.size ();
3302
+ std::cout << " ========================================================="
3303
+ << std::endl;
3304
+ std::cout << " mse_ex = " << mse_ex << " , mse_vk = " << mse_vk << std::endl;
3305
+ }
3306
// Registers the fp32 -> fp16 _to_copy conversion check with the gtest
// harness; all test logic lives in test_to_copy() above.
TEST(VulkanComputeGraphOpsTest, test_to_copy) {
  test_to_copy();
}
0 commit comments