@@ -92,36 +92,28 @@ void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
 }

 void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
-    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        //GGML_ASSERT(out_ids && "every model that can must skip unused outputs");
+    GGML_ASSERT(out_ids);

-        if (!out_ids) {
-            LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__);
-        } else {
-            const int64_t n_tokens = ubatch->n_tokens;
+    const int64_t n_tokens = ubatch->n_tokens;

-            GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
-            int32_t * data = (int32_t *) out_ids->data;
+    GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
+    int32_t * data = (int32_t *) out_ids->data;

-            if (n_outputs == n_tokens) {
-                for (int i = 0; i < n_tokens; ++i) {
-                    data[i] = i;
-                }
-            } else if (ubatch->output) {
-                int32_t n_outputs = 0;
-                for (int i = 0; i < n_tokens; ++i) {
-                    if (ubatch->output[i]) {
-                        data[n_outputs++] = i;
-                    }
-                }
-                // the graph needs to have been passed the correct number of outputs
-                GGML_ASSERT(n_outputs == n_outputs);
-            } else if (n_outputs == 1) {
-                // only keep last output
-                data[0] = n_tokens - 1;
-            } else {
-                GGML_ASSERT(n_outputs == 0);
-            }
+    if (n_outputs == n_tokens) {
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = i;
+        }
+
+        return;
+    }
+
+    GGML_ASSERT(ubatch->output);
+
+    int n_outputs = 0;
+
+    for (int i = 0; i < n_tokens; ++i) {
+        if (ubatch->output[i]) {
+            data[n_outputs++] = i;
         }
     }
 }
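
The rewritten set_input() drops the nested causal_attn/pooling branches: out_ids must now exist (hard assert), the all-outputs case gets an identity mapping with an early return, and otherwise the per-token output flags of the ubatch decide which token indices are written. A minimal standalone sketch of that mapping follows; the helper name build_out_ids(), the std::vector types, and the sample values are illustrative only and not llama.cpp API:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch: map the per-token output flags of a micro-batch to the token indices
// whose outputs are kept, mirroring the shape of the new set_input() logic.
static std::vector<int32_t> build_out_ids(const std::vector<int8_t> & output, int64_t n_outputs) {
    const int64_t n_tokens = (int64_t) output.size();

    std::vector<int32_t> data(n_outputs);

    // fast path: every token is an output -> identity mapping
    if (n_outputs == n_tokens) {
        for (int i = 0; i < n_tokens; ++i) {
            data[i] = i;
        }
        return data;
    }

    // otherwise the per-token flags must be present
    assert(!output.empty());

    int n_out = 0;
    for (int i = 0; i < n_tokens; ++i) {
        if (output[i]) {
            data[n_out++] = i;
        }
    }

    // sanity check for this sketch: the caller asked for exactly this many outputs
    assert(n_out == n_outputs);

    return data;
}

int main() {
    // keep the outputs of tokens 2 and 4 out of a 5-token micro-batch
    const std::vector<int8_t> output = { 0, 0, 1, 0, 1 };

    for (int32_t id : build_out_ids(output, /*n_outputs=*/2)) {
        printf("%d\n", id); // prints 2, then 4
    }
    return 0;
}
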
@@ -874,6 +866,14 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
 }

 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
+    // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
+    //       but this would make the graph topology depend on the number of output tokens, which can interfere with
+    //       features that require constant topology such as pipeline parallelism
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
+    //if (n_outputs < n_tokens) {
+    //    return nullptr;
+    //}
+
     auto inp = std::make_unique<llm_graph_input_out_ids>(hparams, cparams, n_outputs);

     auto & cur = inp->out_ids;
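
The new comment spells out why the out_ids input is now created unconditionally: skipping it when every token is an output would save the ggml_get_rows() calls, but it would make the graph's node set depend on n_outputs, which breaks features that assume a fixed topology such as pipeline parallelism. A toy sketch of that trade-off follows; it is plain C++, not ggml, and the node names and the build_graph() helper are made up for illustration:

#include <cstdio>
#include <string>
#include <vector>

// Toy model of graph construction: optionally emit the row-selection node only
// when some outputs are skipped (the commented-out guard), or always emit it
// (the current behaviour), and compare the resulting node counts.
static std::vector<std::string> build_graph(int n_tokens, int n_outputs, bool skip_when_all_outputs) {
    std::vector<std::string> nodes = { "attn", "ffn" };

    const bool all_outputs = (n_outputs == n_tokens);
    if (!(skip_when_all_outputs && all_outputs)) {
        nodes.push_back("get_rows(out_ids)");
    }

    nodes.push_back("lm_head");
    return nodes;
}

int main() {
    // with the guard, a 4-token batch yields different topologies for 4 vs 1 outputs
    printf("guarded, all outputs: %zu nodes\n", build_graph(4, 4, true).size());  // 3 nodes
    printf("guarded, one output : %zu nodes\n", build_graph(4, 1, true).size());  // 4 nodes

    // without the guard the topology is constant regardless of n_outputs
    printf("always,  all outputs: %zu nodes\n", build_graph(4, 4, false).size()); // 4 nodes
    printf("always,  one output : %zu nodes\n", build_graph(4, 1, false).size()); // 4 nodes
    return 0;
}

In other words, with the guard the same model would produce differently shaped graphs depending on how many outputs the caller requests, which is exactly what a scheduler relying on constant topology cannot tolerate, per the referenced PR comment.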