@@ -71,61 +71,68 @@ void add_copy_packed_dim_offset_node(
71
71
const ivec3& range,
72
72
const ivec4& src_offset,
73
73
const ivec4& dst_offset,
74
- const ValueRef out) {
74
+ const ValueRef out,
75
+ bool repeat) {
75
76
vTensorPtr t_in = graph.get_tensor (in);
76
77
vTensorPtr t_out = graph.get_tensor (out);
77
78
78
- // Check the packed dimension is same for both tensors, and if the packed
79
- // dimension is Width or Height. Since the function does not support channel
80
- // packing.
81
- VK_CHECK_COND (
82
- check_same_packed_dim (*t_in, *t_out) &&
83
- (check_packed_dim_is (*t_in, WHCN::kWidthDim ) ||
84
- check_packed_dim_is (*t_in, WHCN::kHeightDim )));
79
+ // Check the packed dimension is same for both tensors
80
+ VK_CHECK_COND (check_same_packed_dim (*t_in, *t_out));
81
+ if (!repeat) {
82
+ // For non-repeat copies, also check that the packed dimension is Width or
83
+ // Height, since the function does not support channel packing.
84
+ VK_CHECK_COND (
85
+ check_same_packed_dim (*t_in, *t_out) &&
86
+ (check_packed_dim_is (*t_in, WHCN::kWidthDim ) ||
87
+ check_packed_dim_is (*t_in, WHCN::kHeightDim )));
88
+ }
85
89
86
90
std::string kernel_name = " copy_packed_dim_offset" ;
87
91
kernel_name.reserve (kShaderNameReserve );
88
92
add_dtype_suffix (kernel_name, *t_out);
89
93
90
- const auto packed_dim = t_in->packed_dim ();
91
94
// A copy of range with the last element set to batch size of the input tensor
92
95
ivec4 final_range = {
93
96
range[0 ], range[1 ], range[2 ], dim_at (t_in->sizes (), kBatch4D )};
94
97
ivec3 global_wg_size = t_out->logical_limits ();
95
- // The starting offset in a texel where this tensor will start copying from
96
- const auto src_lane_offset = src_offset[packed_dim] & 0x3 ;
97
- // The starting offset in a texel where this tensor will start copying to
98
- const auto dst_lane_offset = dst_offset[packed_dim] & 0x3 ;
99
-
100
- // The total packed texels this tensor will be copied from
101
- // The first texel of tensor data in packed dimension will be copied from
102
- // remaining lanes from current source Hence (4 - src_lane_offset) is added
103
- // to tensor size in packed dimension
104
- const auto src_packed_size = utils::div_up_4 (
105
- (4 - src_lane_offset) +
106
- dim_at (t_out->sizes (), normalize_to_dim_index (*t_out, packed_dim)));
107
-
108
- // The total packed texels this tensor will be copied to
109
- // The first texel of tensor data in packed dimension will be copied to
110
- // remaining lanes from previous write Hence (4 - dst_lane_offset) is added to
111
- // tensor size in packed dimension
112
- const auto dst_packed_size = utils::div_up_4 (
113
- (4 - dst_lane_offset) +
114
- dim_at (t_in->sizes (), normalize_to_dim_index (*t_in, packed_dim)));
115
-
116
- // If the starting src offset is not 0, and the total packed texels is greater
117
- // than the source texel range
118
- const bool has_additional_src_work =
119
- src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
120
- // If the starting dst offset is not 0, and the total packed texels is greater
121
- // than the source texel range
122
- const bool has_additional_dst_work =
123
- dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
124
-
125
- if (has_additional_src_work || has_additional_dst_work) {
126
- global_wg_size[packed_dim]++; // Increase the global work group size in
127
- // packed dimension
128
- final_range[packed_dim]++; // Increase the range in packed dimension
98
+
99
+ if (!repeat) {
100
+ const auto packed_dim = t_in->packed_dim ();
101
+ // The starting offset in a texel where this tensor will start copying from
102
+ const auto src_lane_offset = src_offset[packed_dim] & 0x3 ;
103
+ // The starting offset in a texel where this tensor will start copying to
104
+ const auto dst_lane_offset = dst_offset[packed_dim] & 0x3 ;
105
+
106
+ // The total packed texels this tensor will be copied from
107
+ // The first texel of tensor data in packed dimension will be copied from
108
+ // remaining lanes from current source. Hence (4 - src_lane_offset) is added
109
+ // to tensor size in packed dimension
110
+ const auto src_packed_size = utils::div_up_4 (
111
+ (4 - src_lane_offset) +
112
+ dim_at (t_out->sizes (), normalize_to_dim_index (*t_out, packed_dim)));
113
+
114
+ // The total packed texels this tensor will be copied to
115
+ // The first texel of tensor data in packed dimension will be copied to
116
+ // remaining lanes from previous write. Hence (4 - dst_lane_offset) is added
117
+ // to tensor size in packed dimension
118
+ const auto dst_packed_size = utils::div_up_4 (
119
+ (4 - dst_lane_offset) +
120
+ dim_at (t_in->sizes (), normalize_to_dim_index (*t_in, packed_dim)));
121
+
122
+ // If the starting src offset is not 0, and the total packed texels is
123
+ // greater than the source texel range
124
+ const bool has_additional_src_work =
125
+ src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
126
+ // If the starting dst offset is not 0, and the total packed texels is
127
+ // greater than the source texel range
128
+ const bool has_additional_dst_work =
129
+ dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
130
+
131
+ if (has_additional_src_work || has_additional_dst_work) {
132
+ global_wg_size[packed_dim]++; // Increase the global work group size in
133
+ // packed dimension
134
+ final_range[packed_dim]++; // Increase the range in packed dimension
135
+ }
129
136
}
130
137
131
138
auto shader = VK_KERNEL_FROM_STR (kernel_name);
@@ -144,7 +151,7 @@ void add_copy_packed_dim_offset_node(
144
151
// Parameter buffers
145
152
{},
146
153
// Specialization Constants
147
- {graph.hashed_layout_of (out), graph.hashed_layout_of (in)},
154
+ {graph.hashed_layout_of (out), graph.hashed_layout_of (in), repeat },
148
155
nullptr ,
149
156
{},
150
157
{
0 commit comments