
Commit 5031c50
Merge branch 'master' into betterlogs
2 parents e99f039 + 04f4b1e

35 files changed: +3275 −1342 lines

.devops/full-rocm.Dockerfile

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
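As context for this new Dockerfile, a minimal build sketch (the image tag is an arbitrary example, not part of the commit):

```bash
# Build the "full" ROCm image from the repository root; tag name is an example.
docker build -f .devops/full-rocm.Dockerfile -t llama-cpp-full-rocm .
```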

.devops/main-rocm.Dockerfile

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT [ "/app/main" ]
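A hedged sketch of building and running the `main`-only image. The tag, model path, and prompt are placeholders; ROCm containers generally need the host's `/dev/kfd` and `/dev/dri` devices passed through:

```bash
# Build the main-only ROCm image (tag name is an example).
docker build -f .devops/main-rocm.Dockerfile -t llama-cpp-main-rocm .

# Run inference; /dev/kfd and /dev/dri expose the AMD GPU to the container.
# The model path and prompt below are placeholders.
docker run --rm --device /dev/kfd --device /dev/dri \
    -v /path/to/models:/models llama-cpp-main-rocm \
    -m /models/7B/ggml-model-q4_0.gguf -ngl 32 -p "Building a website can be done in 10 simple steps:"
```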

.dockerignore

Lines changed: 1 addition & 8 deletions
@@ -5,14 +5,7 @@
 .vscode/
 .DS_Store
 
-build/
-build-em/
-build-debug/
-build-release/
-build-static/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build*/
 
 models/*

.gitignore

Lines changed: 2 additions & 15 deletions
@@ -17,20 +17,7 @@
 .vs/
 .vscode/
 
-build/
-build-em/
-build-debug/
-build-release/
-build-ci-debug/
-build-ci-release/
-build-static/
-build-cublas/
-build-opencl/
-build-metal/
-build-mpi/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build*/
 out/
 tmp/
 
@@ -61,6 +48,7 @@ compile_commands.json
 CMakeSettings.json
 
 __pycache__
+dist
 
 zig-out/
 zig-cache/
@@ -71,7 +59,6 @@ perf-*.txt
 
 examples/jeopardy/results.txt
 
-pyproject.toml
 poetry.lock
 poetry.toml

CMakeLists.txt

Lines changed: 38 additions & 0 deletions
@@ -74,6 +74,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
+option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
@@ -352,6 +353,43 @@ if (LLAMA_CLBLAST)
     endif()
 endif()
 
+if (LLAMA_HIPBLAS)
+    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
+
+    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
+    endif()
+    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+    endif()
+
+    find_package(hip)
+    find_package(hipblas)
+    find_package(rocblas)
+
+    if (${hipblas_FOUND} AND ${hip_FOUND})
+        message(STATUS "HIP and hipBLAS found")
+        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
+        if (LLAMA_CUDA_FORCE_DMMV)
+            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
+        endif()
+        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+        target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+        target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
+        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
+        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
+
+        if (LLAMA_STATIC)
+            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+        endif()
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
+    else()
+        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
         set(c_flags
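The warnings above hint at the ROCm LLVM compilers and at `CMAKE_PREFIX_PATH` when hip/hipBLAS are not auto-detected. A minimal configure sketch, assuming a standard `/opt/rocm` install and CMake ≥ 3.13 for `-B`:

```bash
# Configure with the ROCm LLVM toolchain, as the compiler-ID warnings suggest.
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
    cmake -B build -DLLAMA_HIPBLAS=ON -DCMAKE_PREFIX_PATH=/opt/rocm
cmake --build build
```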

Makefile

Lines changed: 24 additions & 0 deletions
@@ -280,6 +280,30 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST
 
+ifdef LLAMA_HIPBLAS
+	ROCM_PATH ?= /opt/rocm
+	HIPCC ?= $(ROCM_PATH)/bin/hipcc
+	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+	LLAMA_CUDA_DMMV_X ?= 32
+	LLAMA_CUDA_MMV_Y ?= 1
+	LLAMA_CUDA_KQUANTS_ITER ?= 2
+	CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+	LDFLAGS += -lhipblas -lamdhip64 -lrocblas
+	HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
+	HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+	HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+	HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+	HIPFLAGS += -DCC_TURING=1000000000
+ifdef LLAMA_CUDA_FORCE_DMMV
+	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
+endif # LLAMA_CUDA_FORCE_DMMV
+	OBJS += ggml-cuda.o
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+endif # LLAMA_HIPBLAS
+
 ifdef LLAMA_METAL
 	CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 	CXXFLAGS += -DGGML_USE_METAL
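A usage sketch for the new make path: `GPU_TARGETS` defaults to whatever `amdgpu-arch` detects on the host, and `gfx1030` below is only an example override, not a value from this commit.

```bash
# Build with hipBLAS; override the auto-detected offload architecture if needed.
make clean
make LLAMA_HIPBLAS=1 GPU_TARGETS=gfx1030 -j"$(nproc)"
```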

README.md

Lines changed: 29 additions & 0 deletions
@@ -422,6 +422,35 @@ Building the program with BLAS support may lead to some performance improvements
 | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
 | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 
+- #### hipBLAS
+
+  This provides BLAS acceleration on HIP-supported GPUs, such as AMD GPUs.
+  Make sure to have ROCm installed.
+  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
+  Windows support is coming soon...
+
+  - Using `make`:
+    ```bash
+    make LLAMA_HIPBLAS=1
+    ```
+  - Using `CMake`:
+    ```bash
+    mkdir build
+    cd build
+    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
+    cmake --build .
+    ```
+
+  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
+  If your GPU is not officially supported, you can set the environment variable `HSA_OVERRIDE_GFX_VERSION` to a similar, supported GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
+  The following compilation options are also available to tweak performance (they refer to CUDA, not HIP, because the HIP build reuses the same code as the cuBLAS version above):
+
+  | Option | Legal values | Default | Description |
+  |-------------------------|------------------------|---------|-------------|
+  | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+
 - #### CLBlast
 
 OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
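To illustrate the two environment variables documented in the new hipBLAS section, a hedged example; the binary name, model path, and layer count are placeholders:

```bash
# Restrict llama.cpp to the first ROCm device and spoof the arch of an
# unofficially supported RDNA2 card (per the README guidance above).
HIP_VISIBLE_DEVICES=0 HSA_OVERRIDE_GFX_VERSION=10.3.0 \
    ./main -m ./models/7B/ggml-model-q4_0.gguf -ngl 32 -p "Hello"
```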

ci/run.sh

Lines changed: 1 addition & 0 deletions
@@ -391,6 +391,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
     ln -sfn ${mnt_models} ${SRC}/models-mnt
 
     python3 -m pip install -r ${SRC}/requirements.txt
+    python3 -m pip install --editable gguf-py
 fi
 
 ret=0
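A quick local sanity check for the editable install added above, assuming (as in this repo) that `gguf-py` exposes a top-level `gguf` module:

```bash
# Install gguf-py in editable mode and confirm the package resolves.
python3 -m pip install --editable gguf-py
python3 -c "import gguf; print(gguf.__file__)"
```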

common/common.cpp

Lines changed: 3 additions & 1 deletion
@@ -635,9 +635,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
     fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
+#ifdef GGML_USE_CUBLAS
     fprintf(stdout, " -nommq, --no-mul-mat-q\n");
-    fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
     fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
+#endif // GGML_USE_CUBLAS
 #endif
     fprintf(stdout, " --mtest compute maximum memory usage\n");
     fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
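The option printed above is only compiled in for cuBLAS/hipBLAS builds. A hedged invocation sketch; the model path and layer count are placeholders:

```bash
# Fall back to plain cuBLAS/hipBLAS GEMM instead of the custom mul_mat_q kernels.
./main -m ./models/7B/ggml-model-q4_0.gguf -ngl 32 --no-mul-mat-q
```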

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ struct gpt_params {
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t n_beams = 0; // if non-zero then use beam search of given width.
     float rope_freq_base = 10000.0f; // RoPE base frequency
     float rope_freq_scale = 1.0f; // RoPE frequency scaling factor

convert.py

Lines changed: 29 additions & 16 deletions
@@ -105,6 +105,7 @@ class Params:
     f_norm_eps: float
 
     f_rope_freq_base: Optional[float] = None
+    f_rope_scale: Optional[float] = None
 
     ftype: Optional[GGMLFileType] = None
 
@@ -160,13 +161,20 @@ def guessed(model: 'LazyModel') -> 'Params':
     def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
         config = json.load(open(config_path))
 
-        n_vocab = config["vocab_size"]
-        n_embd = config["hidden_size"]
-        n_layer = config["num_hidden_layers"]
-        n_ff = config["intermediate_size"]
-        n_head = config["num_attention_heads"]
-        n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
-        f_norm_eps = config["rms_norm_eps"]
+        n_vocab          = config["vocab_size"]
+        n_embd           = config["hidden_size"]
+        n_layer          = config["num_hidden_layers"]
+        n_ff             = config["intermediate_size"]
+        n_head           = config["num_attention_heads"]
+        n_head_kv        = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
+        f_norm_eps       = config["rms_norm_eps"]
+        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
+
+        rope_scaling = config.get("rope_scaling")
+        if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
+            f_rope_scale = config["rope_scaling"].get("factor")
+        else:
+            f_rope_scale = None
 
         n_mult = Params.find_n_mult(n_ff, n_embd)
 
@@ -179,15 +187,17 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
             "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
 
         return Params(
-            n_vocab = n_vocab,
-            n_embd = n_embd,
-            n_mult = n_mult,
-            n_layer = n_layer,
-            n_ctx = n_ctx,
-            n_ff = n_ff,
-            n_head = n_head,
-            n_head_kv = n_head_kv,
-            f_norm_eps = f_norm_eps,
+            n_vocab          = n_vocab,
+            n_embd           = n_embd,
+            n_mult           = n_mult,
+            n_layer          = n_layer,
+            n_ctx            = n_ctx,
+            n_ff             = n_ff,
+            n_head           = n_head,
+            n_head_kv        = n_head_kv,
+            f_norm_eps       = f_norm_eps,
+            f_rope_freq_base = f_rope_freq_base,
+            f_rope_scale     = f_rope_scale,
         )
 
     # LLaMA v2 70B params.json
@@ -771,6 +781,9 @@ def add_meta_arch(self, params: Params) -> None:
         if params.f_rope_freq_base:
             self.gguf.add_rope_freq_base(params.f_rope_freq_base)
 
+        if params.f_rope_scale:
+            self.gguf.add_rope_scale_linear(params.f_rope_scale)
+
         if params.ftype:
             self.gguf.add_file_type(params.ftype)
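With the change above, `rope_theta` and a linear `rope_scaling` factor are read straight from the model's HF `config.json` during conversion and written into the GGUF metadata. A hedged invocation; the directory name and `--outtype` value are examples:

```bash
# Convert a Hugging Face checkpoint; the RoPE parameters are picked up from its config.json.
python3 convert.py ./models/my-hf-llama/ --outtype f16
```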

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ else()
     add_subdirectory(simple)
     add_subdirectory(embd-input)
     add_subdirectory(llama-bench)
+    add_subdirectory(beam_search)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()
endif()

examples/beam_search/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+set(TARGET beam_search)
+add_executable(${TARGET} beam_search.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
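To exercise the new target, a sketch under stated assumptions: the binary output path depends on the CMake generator, and the argument order (model, beam width, prompt) is a guess, so check examples/beam_search/beam_search.cpp for the exact usage.

```bash
# Build only the new example, then run it with a model, a beam width, and a prompt (placeholders).
cmake --build build --target beam_search
./build/bin/beam_search ./models/7B/ggml-model-q4_0.gguf 2 "Once upon a time"
```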
