Commit 812939a

model : more uniform output id handling (#14275)
* model : more uniform output id handling
  ggml-ci
* cont : revert n_outputs < n_tokens optimization
  ggml-ci
* cont : fix out_ids initialization
  ggml-ci
1 parent 4c9fdfb commit 812939a

2 files changed: +455 −438 lines changed


src/llama-graph.cpp

Lines changed: 27 additions & 27 deletions
@@ -92,36 +92,28 @@ void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
 }

 void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
-    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        //GGML_ASSERT(out_ids && "every model that can must skip unused outputs");
+    GGML_ASSERT(out_ids);

-        if (!out_ids) {
-            LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__);
-        } else {
-            const int64_t n_tokens = ubatch->n_tokens;
+    const int64_t n_tokens = ubatch->n_tokens;

-            GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
-            int32_t * data = (int32_t *) out_ids->data;
+    GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
+    int32_t * data = (int32_t *) out_ids->data;

-            if (n_outputs == n_tokens) {
-                for (int i = 0; i < n_tokens; ++i) {
-                    data[i] = i;
-                }
-            } else if (ubatch->output) {
-                int32_t n_outputs = 0;
-                for (int i = 0; i < n_tokens; ++i) {
-                    if (ubatch->output[i]) {
-                        data[n_outputs++] = i;
-                    }
-                }
-                // the graph needs to have been passed the correct number of outputs
-                GGML_ASSERT(n_outputs == n_outputs);
-            } else if (n_outputs == 1) {
-                // only keep last output
-                data[0] = n_tokens - 1;
-            } else {
-                GGML_ASSERT(n_outputs == 0);
-            }
+    if (n_outputs == n_tokens) {
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = i;
+        }
+
+        return;
+    }
+
+    GGML_ASSERT(ubatch->output);
+
+    int n_outputs = 0;
+
+    for (int i = 0; i < n_tokens; ++i) {
+        if (ubatch->output[i]) {
+            data[n_outputs++] = i;
         }
     }
 }
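With this change set_input() has only two cases: either every token of the ubatch is an output and out_ids becomes the identity mapping, or ubatch->output must provide a per-token mask and out_ids collects the marked positions. The old "keep only the last output" (n_outputs == 1) and "no outputs" fallbacks are gone. A minimal standalone sketch of that selection logic, using hypothetical names and plain std::vector instead of the llama.cpp/ggml types, looks roughly like this:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical helper mirroring the selection logic above (not llama.cpp code):
// either every token is an output, or an explicit per-token mask picks the outputs.
static std::vector<int32_t> build_out_ids(int64_t n_tokens,
                                          int64_t n_outputs,
                                          const int8_t * output_mask /* may be nullptr */) {
    std::vector<int32_t> out_ids;
    out_ids.reserve((size_t) n_outputs);

    if (n_outputs == n_tokens) {
        // all tokens are outputs - identity mapping
        for (int32_t i = 0; i < n_tokens; ++i) {
            out_ids.push_back(i);
        }
        return out_ids;
    }

    // otherwise the batch must state explicitly which tokens produce outputs
    assert(output_mask != nullptr);

    for (int32_t i = 0; i < n_tokens; ++i) {
        if (output_mask[i]) {
            out_ids.push_back(i);
        }
    }

    // the graph must have been built for exactly this many outputs
    assert((int64_t) out_ids.size() == n_outputs);

    return out_ids;
}

int main() {
    // typical decode step: 5 tokens, only the last one produces logits
    const int8_t mask[5] = { 0, 0, 0, 0, 1 };
    const auto out_ids = build_out_ids(5, 1, mask);

    printf("out_ids[0] = %d\n", out_ids[0]); // prints 4
    return 0;
}

In other words, a caller that wants only the last token's logits is now expected to mark that position in the output mask rather than relying on the removed n_outputs == 1 special case.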
@@ -874,6 +866,14 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
 }

 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
+    // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
+    //       but this would make the graph topology depend on the number of output tokens, which can interfere with
+    //       features that require constant topology such as pipeline parallelism
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
+    //if (n_outputs < n_tokens) {
+    //    return nullptr;
+    //}
+
     auto inp = std::make_unique<llm_graph_input_out_ids>(hparams, cparams, n_outputs);

     auto & cur = inp->out_ids;
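The tensor returned by build_inp_out_ids() is what the model graphs feed to ggml_get_rows() (as the note above mentions) so that only the rows belonging to output tokens are carried forward. As a rough plain C++ illustration of that gather, not ggml code, with a row-major [n_tokens x n_embd] layout assumed only for the example:

#include <cstdint>
#include <cstdio>
#include <vector>

// Illustration only: gather the rows selected by out_ids from a row-major
// [n_tokens x n_embd] activation buffer, which is what the ggml_get_rows()
// node does inside the compute graph.
static std::vector<float> gather_output_rows(const std::vector<float> & hidden, // n_tokens * n_embd
                                             int64_t n_embd,
                                             const std::vector<int32_t> & out_ids) {
    std::vector<float> out;
    out.reserve(out_ids.size() * (size_t) n_embd);

    for (int32_t row : out_ids) {
        const float * src = hidden.data() + (size_t) row * (size_t) n_embd;
        out.insert(out.end(), src, src + n_embd);
    }

    return out; // [n_outputs x n_embd]
}

int main() {
    // 3 tokens, n_embd = 2, keep only the last token's row
    const std::vector<float> hidden = { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f };
    const auto out = gather_output_rows(hidden, 2, { 2 });

    printf("%.1f %.1f\n", out[0], out[1]); // prints 4.0 5.0
    return 0;
}

Keeping this gather node in the graph even when n_outputs == n_tokens is the point of the reverted optimization: the copy is then effectively a no-op, but the graph topology stays the same regardless of how many outputs the batch requests, which is what features such as pipeline parallelism rely on per the note above.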
