Skip to content

Commit 271f0e1

Browse files
committed
[ET-VK] Consolidate shader compilation into one vkCreateComputePipelines call
We target the QC Adreno driver implementation of Vulkan. The Vulkan API does not enforce how QC actually uses the cache. As the plural naming of `vkCreateComputePipelines` suggests, we observed that the `createInfoCount`, `pCreateInfos` and `pPipelines` arguments above allow construction of multiple compute pipelines in one invocation. We refactor ET-VK to accumulate metadata necessary for pipeline construction and invoke vkCreateComputePipelines only once. QC's implementation maximizes the cache if we create the same number of compute pipelines in fewer invocations of vkCreateComputePipelines. This decreases model load for a sample model from 1.7s to 1.0s, and down to 300ms once ssjia removes the noop shader. Differential Revision: [D75763660](https://our.internmc.facebook.com/intern/diff/D75763660/) ghstack-source-id: 287485414 Pull Request resolved: #11345
1 parent 879eee0 commit 271f0e1

File tree

9 files changed

+148
-1
lines changed

9 files changed

+148
-1
lines changed

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,38 @@ void ComputeGraph::update_descriptor_counts(
552552
}
553553
}
554554

555+
void ComputeGraph::update_pipeline_descriptors(
556+
const vkapi::ShaderInfo& shader_info,
557+
const utils::WorkgroupSize& local_workgroup_size,
558+
const vkapi::SpecVarList& spec_vars,
559+
const std::vector<PushConstantDataInfo>& push_constants) {
560+
VkDescriptorSetLayout shader_layout =
561+
context()->shader_layout_cache().retrieve(shader_info.kernel_layout);
562+
563+
uint32_t pc_offset = 0;
564+
std::array<uint8_t, kMaxPushConstantSize> pc_data;
565+
for (const auto& pc : push_constants) {
566+
pc_offset += pc.write(pc_data.data(), pc_offset, kMaxPushConstantSize);
567+
}
568+
569+
vkapi::SpecVarList spec_constants = {
570+
SV(local_workgroup_size[0u]),
571+
SV(local_workgroup_size[1u]),
572+
SV(local_workgroup_size[2u])};
573+
574+
spec_constants.append(spec_vars);
575+
576+
const vkapi::ComputePipelineCache::Key desc = {
577+
context()->pipeline_layout_cache().retrieve(shader_layout, pc_offset),
578+
context()->shader_cache().retrieve(shader_info),
579+
spec_constants};
580+
581+
auto it = pipeline_descriptors_.find(desc);
582+
if (it == pipeline_descriptors_.cend()) {
583+
pipeline_descriptors_.insert(desc);
584+
}
585+
}
586+
555587
utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) {
556588
if (is_buffer_storage(idx)) {
557589
return {uint32_t(numel_of(idx)), 1u, 1u};
@@ -659,6 +691,14 @@ void ComputeGraph::prepare() {
659691
shared_object.allocate(this);
660692
shared_object.bind_users(this);
661693
}
694+
695+
for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
696+
node->prepare_pipelines(this);
697+
}
698+
for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
699+
node->prepare_pipelines(this);
700+
}
701+
context_->pipeline_cache().create_pipelines(pipeline_descriptors_);
662702
}
663703

664704
void ComputeGraph::encode_prepack() {

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,11 @@ class ComputeGraph final {
185185
std::vector<IOValueRef> inputs_;
186186
std::vector<IOValueRef> outputs_;
187187

188+
std::unordered_set<
189+
vkapi::ComputePipelineCache::Key,
190+
vkapi::ComputePipelineCache::Hasher>
191+
pipeline_descriptors_;
192+
188193
protected:
189194
size_t values_in_use_ = 0;
190195
size_t execute_count_ = 0;
@@ -704,6 +709,12 @@ class ComputeGraph final {
704709
const vkapi::ShaderInfo& shader_info,
705710
bool execute);
706711

712+
void update_pipeline_descriptors(
713+
const vkapi::ShaderInfo& shader_info,
714+
const utils::WorkgroupSize& local_workgroup_size,
715+
const vkapi::SpecVarList& spec_vars,
716+
const std::vector<PushConstantDataInfo>& push_constants);
717+
707718
void prepare();
708719

709720
//

backends/vulkan/runtime/graph/ops/DispatchNode.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ DispatchNode::DispatchNode(
3535
graph.update_descriptor_counts(shader, /*execute = */ true);
3636
}
3737

38+
void DispatchNode::prepare_pipelines(ComputeGraph* graph) {
39+
graph->update_pipeline_descriptors(
40+
shader_, local_workgroup_size_, spec_vars_, push_constants_);
41+
}
42+
3843
void DispatchNode::encode(ComputeGraph* graph) {
3944
if (!shader_) {
4045
return;

backends/vulkan/runtime/graph/ops/DispatchNode.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ class DispatchNode : public ExecuteNode {
4040

4141
~DispatchNode() override = default;
4242

43+
void prepare_pipelines(ComputeGraph* graph) override;
44+
4345
void encode(ComputeGraph* graph) override;
4446

4547
protected:

backends/vulkan/runtime/graph/ops/ExecuteNode.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ class ExecuteNode {
6161

6262
virtual ~ExecuteNode() = default;
6363

64+
virtual void prepare_pipelines(ComputeGraph* graph) {
65+
(void)graph;
66+
}
67+
6468
virtual void encode(ComputeGraph* graph) {
6569
(void)graph;
6670
}

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,13 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
6767
return staging;
6868
}
6969

70+
void PrepackNode::prepare_pipelines(ComputeGraph* graph) {
71+
graph->update_pipeline_descriptors(
72+
shader_, local_workgroup_size_, spec_vars_, push_constants_);
73+
graph->update_pipeline_descriptors(
74+
noop_shader_, utils::WorkgroupSize(1, 1, 1), {}, push_constants_);
75+
}
76+
7077
void PrepackNode::encode(ComputeGraph* graph) {
7178
api::Context* const context = graph->context();
7279

backends/vulkan/runtime/graph/ops/PrepackNode.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ class PrepackNode final {
4040

4141
~PrepackNode() = default;
4242

43+
void prepare_pipelines(ComputeGraph* graph);
44+
4345
void encode(ComputeGraph* graph);
4446

4547
inline void set_node_id(uint32_t node_id) {

backends/vulkan/runtime/vk_api/Pipeline.cpp

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,9 @@ void swap(PipelineLayout& lhs, PipelineLayout& rhs) noexcept {
270270
// ComputePipeline
271271
//
272272

273+
ComputePipeline::ComputePipeline(VkDevice device, VkPipeline handle)
274+
: device_{device}, handle_{handle} {}
275+
273276
ComputePipeline::ComputePipeline(
274277
VkDevice device,
275278
const ComputePipeline::Descriptor& descriptor,
@@ -444,12 +447,77 @@ ComputePipelineCache::~ComputePipelineCache() {
444447
pipeline_cache_ = VK_NULL_HANDLE;
445448
}
446449

450+
void ComputePipelineCache::create_pipelines(
451+
const std::unordered_set<Key, Hasher>& descriptors) {
452+
std::lock_guard<std::mutex> lock(cache_mutex_);
453+
454+
const auto num_pipelines = descriptors.size();
455+
std::vector<VkPipeline> pipelines(num_pipelines);
456+
457+
std::vector<std::vector<VkSpecializationMapEntry>> map_entries;
458+
map_entries.reserve(num_pipelines);
459+
460+
std::vector<VkSpecializationInfo> specialization_infos;
461+
specialization_infos.reserve(num_pipelines);
462+
463+
std::vector<VkPipelineShaderStageCreateInfo> shader_stage_create_infos;
464+
shader_stage_create_infos.reserve(num_pipelines);
465+
466+
std::vector<VkComputePipelineCreateInfo> create_infos;
467+
create_infos.reserve(num_pipelines);
468+
469+
for (auto& key : descriptors) {
470+
map_entries.push_back(key.specialization_constants.generate_map_entries());
471+
472+
specialization_infos.push_back(VkSpecializationInfo{
473+
key.specialization_constants.size(), // mapEntryCount
474+
map_entries.back().data(), // pMapEntries
475+
key.specialization_constants.data_nbytes(), // dataSize
476+
key.specialization_constants.data(), // pData
477+
});
478+
479+
shader_stage_create_infos.push_back(VkPipelineShaderStageCreateInfo{
480+
VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // sType
481+
nullptr, // pNext
482+
0u, // flags
483+
VK_SHADER_STAGE_COMPUTE_BIT, // stage
484+
key.shader_module, // module
485+
"main", // pName
486+
&specialization_infos.back(), // pSpecializationInfo
487+
});
488+
489+
create_infos.push_back(VkComputePipelineCreateInfo{
490+
VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, // sType
491+
nullptr, // pNext
492+
0u, // flags
493+
shader_stage_create_infos.back(), // stage
494+
key.pipeline_layout, // layout
495+
VK_NULL_HANDLE, // basePipelineHandle
496+
0u, // basePipelineIndex
497+
});
498+
}
499+
500+
VK_CHECK(vkCreateComputePipelines(
501+
device_,
502+
pipeline_cache_,
503+
create_infos.size(),
504+
create_infos.data(),
505+
nullptr,
506+
pipelines.data()));
507+
508+
uint32_t i = 0;
509+
for (auto& key : descriptors) {
510+
cache_.insert({key, ComputePipelineCache::Value(device_, pipelines[i])});
511+
++i;
512+
}
513+
}
514+
447515
VkPipeline ComputePipelineCache::retrieve(
448516
const ComputePipelineCache::Key& key) {
449517
std::lock_guard<std::mutex> lock(cache_mutex_);
450-
451518
auto it = cache_.find(key);
452519
if (cache_.cend() == it) {
520+
// Pipelines for dynamic shapes must be created individually
453521
it = cache_
454522
.insert(
455523
{key,

backends/vulkan/runtime/vk_api/Pipeline.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#include <mutex>
2121
#include <unordered_map>
22+
#include <unordered_set>
2223

2324
#define SV(x) ::vkcompute::vkapi::SpecVar(x)
2425

@@ -158,6 +159,8 @@ class ComputePipeline final {
158159
SpecVarList specialization_constants;
159160
};
160161

162+
explicit ComputePipeline(VkDevice device, VkPipeline handle);
163+
161164
explicit ComputePipeline(
162165
VkDevice device,
163166
const Descriptor& descriptor,
@@ -185,6 +188,10 @@ class ComputePipeline final {
185188
// does not allow for move assignment. The swap function will
186189
// be used in the hash map.
187190
friend void swap(ComputePipeline& lhs, ComputePipeline& rhs) noexcept;
191+
192+
friend bool operator==(
193+
const ComputePipeline::Descriptor& _1,
194+
const ComputePipeline::Descriptor& _2);
188195
};
189196

190197
class PipelineLayoutCache final {
@@ -293,6 +300,7 @@ class ComputePipelineCache final {
293300
const std::string cache_data_path_;
294301

295302
public:
303+
void create_pipelines(const std::unordered_set<Key, Hasher>&);
296304
VkPipeline retrieve(const Key&);
297305
void purge();
298306
};

0 commit comments

Comments
 (0)