@@ -145,7 +145,7 @@ void ggml_internal_compute_forward_mul_mat(
     struct ggml_tensor * dst);


-// examples/mulmat-tune/mulmat-tune.h
+// ggml-tune.h

 struct ggml_task_stage {
     /* enum ggml_backend */ int backend;
@@ -250,7 +250,7 @@ Terms:
 - #1: the `q_f32` BLAS implementation in master (enabled when either
   `GGML_USE_ACCELERATE` or `GGML_USE_OPENBLAS` is defined)
 - #2: split `#1` into `INIT` and `COMPUTE`, where INIT runs de-quantization
-  with N threads, COMPUTE in with Accelerate with 1 thread.
+  with N threads, and COMPUTE runs with BLAS and 1 thread.

 `#0_0` is read as "profile #0, stage 0 (INIT)", `#0_1` is read as
 "profile #0, stage 1 (COMPUTE)", and `#0__` is read as total time.
@@ -521,7 +521,7 @@ total_time = init_time / nth + compute_time
 For any given M/N/K/n_threads, we can interpolate the time for M between the
 two nearest benched `M`s, whether or not M falls inside the bench range.

-See `ggml_mulmat_tune_estimate_time()` in file [mulmat-tune.c](./mulmat-tune.c)
+See `ggml_mulmat_tune_estimate_time()` in file [ggml-tune.c](../../ggml-tune.c)
 for how to estimate time.

 The linear interpolation (t = aM + b) should work well for N/K that are both > 0.
@@ -546,17 +546,17 @@ simple cache is used, the overhead MAY goes down to about 10 us.

 ## Wait-Notify Overhead

-Each call is about 10 us, may vary 5x. Since every mul_mat that run with-gpu
+Each call is about 10 us and may vary 5x. Since every mul_mat that runs with GPU/BLAS
 takes several ms to hundreds of ms, and the average boost is large, the
 wait-notify overhead is acceptable.

## High Level Guide to Code Review

 **Major Changes**

-- examples/mulmat-tune provides the tool, data file format and data
-  structure/APIs for graph compute. Some of them are expected be integrated into
-  ggml.c/ggml.h.
+- examples/mulmat-tune provides the bench tools.
+- ggml-tune.{c,h}: data file format and data structure/APIs for graph compute.
+  Some of them are expected to be integrated into ggml.c/ggml.h.
 - ggml.h: exposes a test function for mulmat-tune-bench.c; new fields and structs.
 - ggml.c: new threading framework; updates to `ggml_compute_forward_mul_mat()`;
   updated BLAS code for the new task config/profile; split COMPUTE into INIT +
@@ -571,7 +571,7 @@ I assume we agree that:

 1. Discuss and evaluate; determine whether this pull request makes sense.
 2. Fix and enhance, and rebase onto latest master.
-3. If it useful and being accepted, then split in to smaller pull requests.
+3. If it's useful and accepted, then split it into smaller pull requests.

 Here are the possible merge steps I have in mind: