Commit 17d2f47

Merge branch 'master' into layla-build
2 parents 00c2ccb + 66ba560

183 files changed, with 29,288 additions and 19,081 deletions

.clang-tidy

Lines changed: 1 addition & 0 deletions

@@ -12,6 +12,7 @@ Checks: >
 -readability-implicit-bool-conversion,
 -readability-magic-numbers,
 -readability-uppercase-literal-suffix,
+-readability-simplify-boolean-expr,
 clang-analyzer-*,
 -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
 performance-*,
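
The added entry disables clang-tidy's readability-simplify-boolean-expr check across the tree. A minimal sketch of invoking the tool against the updated config, not part of this commit (the file name and the compile flags after the double dash are illustrative):

    # clang-tidy reads the nearest .clang-tidy automatically
    clang-tidy llama.cpp -- -std=c++11 -I.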

.devops/full-cuda.Dockerfile

Lines changed: 2 additions & 2 deletions

@@ -26,8 +26,8 @@ COPY . .

 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1

 RUN make
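
The ENV line is what the subsequent RUN make picks up, so the same rename applies to manual builds outside Docker; a minimal sketch, not part of this diff (job count illustrative):

    # formerly: make -j LLAMA_CUBLAS=1
    make -j LLAMA_CUDA=1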

.devops/llama-cpp-cublas.srpm.spec renamed to .devops/llama-cpp-cuda.srpm.spec

Lines changed: 11 additions & 11 deletions

@@ -12,7 +12,7 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.

-Name: llama.cpp-cublas
+Name: llama.cpp-cuda
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@@ -32,24 +32,24 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master

 %build
-make -j LLAMA_CUBLAS=1
+make -j LLAMA_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcublas
-cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+cp -p main %{buildroot}%{_bindir}/llamacppcuda
+cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple

 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
 [Unit]
 Description=Llama.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
+ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -67,10 +67,10 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llamacppcublas
-%{_bindir}/llamacppcublasserver
-%{_bindir}/llamacppcublassimple
-/usr/lib/systemd/system/llamacublas.service
+%{_bindir}/llamacppcuda
+%{_bindir}/llamacppcudaserver
+%{_bindir}/llamacppcudasimple
+/usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama

 %pre
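
With the renamed binaries and unit file from this spec installed, usage changes roughly as in the sketch below, not part of the commit (the model path and prompt are placeholders, and server arguments normally come from /etc/sysconfig/llama via $LLAMA_ARGS):

    # the systemd unit was renamed from llamacublas.service
    sudo systemctl start llamacuda.service
    # the main CLI binary was renamed from llamacppcublas
    /usr/bin/llamacppcuda -m /path/to/model.gguf -p "Hello"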

.devops/main-cuda.Dockerfile

Lines changed: 2 additions & 2 deletions

@@ -20,8 +20,8 @@ COPY . .

 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1

 RUN make

.devops/nix/package.nix

Lines changed: 35 additions & 13 deletions

@@ -4,13 +4,14 @@
   config,
   stdenv,
   mkShell,
+  runCommand,
   cmake,
   ninja,
   pkg-config,
   git,
   python3,
   mpi,
-  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
+  blas,
   cudaPackages,
   darwin,
   rocmPackages,
@@ -23,7 +24,7 @@
     useOpenCL
     useRocm
     useVulkan
-  ],
+  ] && blas.meta.available,
   useCuda ? config.cudaSupport,
   useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
   useMpi ? false, # Increases the runtime closure size by ~700M
@@ -35,7 +36,8 @@
   # It's necessary to consistently use backendStdenv when building with CUDA support,
   # otherwise we get libstdc++ errors downstream.
   effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
-  enableStatic ? effectiveStdenv.hostPlatform.isStatic
+  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
+  precompileMetalShaders ? false
 }@inputs:

 let
@@ -65,10 +67,15 @@ let
   strings.optionalString (suffices != [ ])
     ", accelerated with ${strings.concatStringsSep ", " suffices}";

+  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
+
   # TODO: package the Python in this repository in a Nix-like way.
   # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
   # is PEP 517-compatible, and ensure the correct .dist-info is generated.
   # https://peps.python.org/pep-0517/
+  #
+  # TODO: Package up each Python script or service appropriately, by making
+  # them into "entrypoints"
   llama-python = python3.withPackages (
     ps: [
       ps.numpy
@@ -87,6 +94,11 @@ let
     ]
   );

+  xcrunHost = runCommand "xcrunHost" {} ''
+    mkdir -p $out/bin
+    ln -s /usr/bin/xcrun $out/bin
+  '';
+
   # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
   # separately
   darwinBuildInputs =
@@ -150,13 +162,18 @@ effectiveStdenv.mkDerivation (
    postPatch = ''
      substituteInPlace ./ggml-metal.m \
        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-
-      # TODO: Package up each Python script or service appropriately.
-      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
-      # we could make those *.py into setuptools' entrypoints
-      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
+      substituteInPlace ./ggml-metal.m \
+        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
    '';

+    # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+    # `default.metallib` may be compiled with Metal compiler from XCode
+    # and we need to escape sandbox on MacOS to access Metal compiler.
+    # `xcrun` is used find the path of the Metal compiler, which is varible
+    # and not on $PATH
+    # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+    __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
+
    nativeBuildInputs =
      [
        cmake
@@ -173,6 +190,8 @@ effectiveStdenv.mkDerivation (
      ]
      ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
        glibc.static
+      ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
+        xcrunHost
      ];

    buildInputs =
@@ -181,6 +200,7 @@ effectiveStdenv.mkDerivation (
      ++ optionals useMpi [ mpi ]
      ++ optionals useOpenCL [ clblast ]
      ++ optionals useRocm rocmBuildInputs
+      ++ optionals useBlas [ blas ]
      ++ optionals useVulkan vulkanBuildInputs;

    cmakeFlags =
@@ -191,7 +211,7 @@ effectiveStdenv.mkDerivation (
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
        (cmakeBool "LLAMA_BLAS" useBlas)
        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-        (cmakeBool "LLAMA_CUBLAS" useCuda)
+        (cmakeBool "LLAMA_CUDA" useCuda)
        (cmakeBool "LLAMA_HIPBLAS" useRocm)
        (cmakeBool "LLAMA_METAL" useMetalKit)
        (cmakeBool "LLAMA_MPI" useMpi)
@@ -216,14 +236,16 @@ effectiveStdenv.mkDerivation (
        # Should likely use `rocmPackages.clr.gpuTargets`.
        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
      ]
-      ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
-      ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
+      ++ optionals useMetalKit [
+        (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
+        (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+      ];

    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
    # if they haven't been added yet.
    postInstall = ''
-      mv $out/bin/main $out/bin/llama
-      mv $out/bin/server $out/bin/llama-server
+      mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
+      mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
      mkdir -p $out/include
      cp $src/llama.h $out/include/
    '';
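
The cmakeBool change above mirrors the flag rename for plain CMake builds; a minimal sketch of the equivalent manual invocation, not part of this commit (build directory name illustrative):

    # formerly: cmake -B build -DLLAMA_CUBLAS=ON
    cmake -B build -DLLAMA_CUDA=ON
    cmake --build build -j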

.devops/server-cuda.Dockerfile

Lines changed: 2 additions & 2 deletions

@@ -20,8 +20,8 @@ COPY . .

 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1

 RUN make
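
All three CUDA Dockerfiles in this commit carry the same two-line change; a hedged sketch of building one of them from the repository root (image tag and architecture value are illustrative, and CUDA_DOCKER_ARCH is assumed to be declared as a build argument, since only its ENV line appears in the diff):

    docker build -f .devops/server-cuda.Dockerfile \
      --build-arg CUDA_DOCKER_ARCH=all \
      -t llama-cpp:server-cuda .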
