Skip to content

enable CPU HBM #2603

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Sep 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,11 @@ endif()

# ggml

# Optional CPU high-bandwidth-memory (HBM) support, enabled by setting GGML_USE_CPU_HBM.
if (GGML_USE_CPU_HBM)
# Expose the option to the sources as a preprocessor symbol; ggml.c and llama.cpp
# test it with #ifdef before including <hbwmalloc.h>.
add_definitions(-DGGML_USE_CPU_HBM)
# Locate the memkind library; REQUIRED makes configuration fail if it is not found.
find_library(memkind memkind REQUIRED)
endif()

add_library(ggml OBJECT
ggml.c
ggml.h
Expand All @@ -572,6 +577,9 @@ add_library(ggml OBJECT
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
target_compile_features(ggml PUBLIC c_std_11) # don't bump
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
# When CPU HBM support is enabled, also link ggml against memkind
# (presumably the provider of the hbw_* functions called from ggml.c -- confirm).
if (GGML_USE_CPU_HBM)
target_link_libraries(ggml PUBLIC memkind)
endif()

add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
if (BUILD_SHARED_LIBS)
Expand Down
20 changes: 19 additions & 1 deletion ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ typedef void * thread_ret_t;
#include <sys/stat.h>
#include <unistd.h>

#endif
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif

// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
Expand Down Expand Up @@ -192,8 +195,14 @@ typedef void * thread_ret_t;
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
inline static void * ggml_aligned_malloc(size_t size) {
if (size == 0) {
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
return NULL;
}
void * aligned_memory = NULL;
#ifdef GGML_USE_METAL
#ifdef GGML_USE_CPU_HBM
int result = hbw_posix_memalign(&aligned_memory, 16, size);
#elif GGML_USE_METAL
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
#else
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
Expand All @@ -215,8 +224,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
return aligned_memory;
}
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
// GGML_ALIGNED_FREE(ptr): release a buffer obtained from GGML_ALIGNED_MALLOC.
// With GGML_USE_CPU_HBM the buffer must go back through hbw_free(); otherwise plain free() is used.
#ifdef GGML_USE_CPU_HBM
// Wrapped in do { } while (0) so the macro expands to exactly one statement: a bare
// `if (...) hbw_free(ptr)` would silently capture an `else` that follows the call site.
// NULL pointers are skipped, as before. Note that `ptr` is evaluated twice.
#define GGML_ALIGNED_FREE(ptr) do { if (NULL != (ptr)) hbw_free(ptr); } while (0)
#else
#define GGML_ALIGNED_FREE(ptr) free(ptr)
#endif
#endif

#define UNUSED GGML_UNUSED
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
Expand Down Expand Up @@ -4566,6 +4579,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
return NULL;
}

// allow to call ggml_init with 0 size
if (params.mem_size == 0) {
params.mem_size = GGML_MEM_ALIGN;
}

const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);

*ctx = (struct ggml_context) {
Expand Down
12 changes: 11 additions & 1 deletion llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@ void replace_all(std::string & s, const std::string & search, const std::string
}
s = std::move(result);
}
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif

static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
Expand Down Expand Up @@ -450,6 +453,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
#elif GGML_USE_METAL
# define llama_host_malloc(n) ggml_metal_host_malloc(n)
# define llama_host_free(data) ggml_metal_host_free(data)
#elif GGML_USE_CPU_HBM
// CPU HBM build: host buffers are allocated and released through the hbw_* functions.
# define llama_host_malloc(n) hbw_malloc(n)
// do { } while (0) makes the macro a single statement, so an `else` written after a call
// binds to the caller's `if` rather than to the NULL check hidden inside the macro.
// NULL pointers are skipped, as before. Note that `data` is evaluated twice.
# define llama_host_free(data) do { if ((data) != NULL) hbw_free(data); } while (0)
#else
# define llama_host_malloc(n) malloc(n)
# define llama_host_free(data) free(data)
Expand Down Expand Up @@ -1489,7 +1495,11 @@ struct llama_model_loader {
// allocate temp buffer if not using mmap
if (!use_mmap && cur->data == NULL) {
GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
cur->data = malloc(ggml_nbytes(cur));
#ifdef GGML_USE_CPU_HBM
cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
#else
cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
#endif
}

load_data_for(cur);
Expand Down