Commit d8147f2

Remove multiple shards

1 parent 0be54f7 commit d8147f2

File tree: 1 file changed, +10 -93 lines


llama.cpp

Lines changed: 10 additions & 93 deletions
@@ -376,18 +376,11 @@ struct llama_load_tensor_shard {
     }
 };
 
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
+    llama_load_tensor_shard first_shard;
 
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
@@ -397,58 +390,16 @@ struct llama_load_tensor {
 
     void calc_all() {
         calc_type();
-        calc_split_type();
         calc_ne();
         calc_size();
     }
 
     void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
         type = first_shard.type;
     }
 
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-            name.find(".attention.wo.weight") != std::string::npos ||
-            name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
     void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
         ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
     }
 
     void calc_size() {
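
For reference, the deleted calc_split_type()/calc_ne() pair derived the combined tensor shape from one shard's shape plus the split type. Below is a minimal standalone sketch of that shape arithmetic, assuming a hypothetical 2-way split of a 4096 x 11008 shard (the dimensions are made up, and the original code used checked_mul to guard against overflow):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Mirrors the removed calc_ne() switch: combine one shard's shape with
    // the shard count according to the split type.
    enum split_type_t { SPLIT_NONE, SPLIT_BY_COLUMNS, SPLIT_BY_ROWS };

    static std::vector<uint32_t> combined_ne(const std::vector<uint32_t> & shard_ne,
                                             uint32_t n_shards, split_type_t split) {
        switch (split) {
            case SPLIT_BY_COLUMNS: return {shard_ne[0] * n_shards, shard_ne[1]};
            case SPLIT_BY_ROWS:    return {shard_ne[0], shard_ne[1] * n_shards};
            default:               return shard_ne; // SPLIT_NONE: every file holds the whole tensor
        }
    }

    int main() {
        std::vector<uint32_t> shard = {4096, 11008};
        auto by_cols = combined_ne(shard, 2, SPLIT_BY_COLUMNS);
        auto by_rows = combined_ne(shard, 2, SPLIT_BY_ROWS);
        printf("by columns: %u x %u\n", (unsigned) by_cols[0], (unsigned) by_cols[1]); // 8192 x 11008
        printf("by rows:    %u x %u\n", (unsigned) by_rows[0], (unsigned) by_rows[1]); // 4096 x 22016
        assert(by_cols[0] == 8192 && by_rows[1] == 22016);
        return 0;
    }
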
@@ -589,7 +540,7 @@ struct llama_file_loader {
                 idx = tensors_map.tensors.size() - 1;
                 tensors_map.name_to_idx.emplace(name, idx);
             }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.at(idx).first_shard = shard;
         }
     }
 };
@@ -693,10 +644,8 @@ struct llama_model_loader {
 
     bool alignment_prevents_mmap() {
         for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
+            if (lt.first_shard.file_off & 3) {
+                return true;
             }
         }
         return false;
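
The check above is a power-of-two mask test: file_off & 3 is nonzero exactly when the offset is not a multiple of 4 bytes, in which case mmap-based loading is ruled out. A tiny self-contained illustration (the offsets below are hypothetical):

    #include <cstdint>
    #include <cstdio>

    // Same idea as the loop above: (off & 3) != 0  <=>  off % 4 != 0,
    // i.e. the tensor data does not start on a 4-byte boundary.
    static bool misaligned_for_mmap(uint64_t file_off) {
        return (file_off & 3) != 0;
    }

    int main() {
        const uint64_t offs[] = {0, 4, 100, 102, 4097};
        for (uint64_t off : offs) {
            printf("offset %llu -> %s\n", (unsigned long long) off,
                   misaligned_for_mmap(off) ? "prevents mmap" : "ok");
        }
        return 0;
    }
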
@@ -708,7 +657,7 @@ struct llama_model_loader {
             throw std::runtime_error(std::string("missing tok_embeddings.weight"));
         }
         const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
+        return file_loaders.at(0)->hparams.n_embd / lt.first_shard.ne.at(0);
     }
 
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -830,45 +779,13 @@ struct llama_model_loader {
 
     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-            LLAMA_ASSERT(lt.shards.size() == 1);
-            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
-        } else if (lt.split_type == SPLIT_NONE) {
-            llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.first_shard.file_off;
+        } else {
+            llama_file & file = file_loaders.at(lt.first_shard.file_idx)->file;
+            file.seek(lt.first_shard.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
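
The bulk of the deletion is the SPLIT_BY_COLUMNS path, which read every shard into a temporary buffer and then interleaved the buffers row by row into the destination tensor. A self-contained sketch of that reassembly on toy byte buffers (the helper name join_by_columns and the sample data are invented for illustration; the real code worked on raw tensor bytes with LLAMA_ASSERT checks):

    #include <cassert>
    #include <cstring>
    #include <vector>

    // Mimics the removed column-split reassembly: every shard holds the same
    // number of rows, and output row r is the concatenation of row r from
    // each shard, in shard order.
    static std::vector<char> join_by_columns(const std::vector<std::vector<char>> & shards,
                                             size_t num_rows) {
        const size_t per_shard_row_size = shards.at(0).size() / num_rows;
        std::vector<char> out(shards.size() * shards.at(0).size());
        size_t out_offset = 0;
        for (size_t row = 0; row < num_rows; row++) {
            for (const auto & shard : shards) {
                memcpy(out.data() + out_offset,
                       shard.data() + row * per_shard_row_size,
                       per_shard_row_size);
                out_offset += per_shard_row_size;
            }
        }
        assert(out_offset == out.size());
        return out;
    }

    int main() {
        // Two shards, each with two 2-byte rows: "ab"/"cd" and "AB"/"CD".
        std::vector<std::vector<char>> shards = {{'a','b','c','d'}, {'A','B','C','D'}};
        std::vector<char> joined = join_by_columns(shards, 2);
        // Row-interleaved result: "abAB" followed by "cdCD".
        const char expect[] = {'a','b','A','B','c','d','C','D'};
        assert(memcmp(joined.data(), expect, sizeof(expect)) == 0);
        return 0;
    }
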
