pytorch · swolchok · Sep 11, 2024 · Sep 12, 2024 · Sep 12, 2024 · Sep 12, 2024
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/LICENSE.md b/extension/llm/custom_ops/spinquant/third-party/FFHT/LICENSE.md
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Alexandr Andoni, Piotr Indyk, Thijs Laarhoven,
+Ilya Razenshteyn, Ludwig Schmidt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile b/extension/llm/custom_ops/spinquant/third-party/FFHT/Makefile
@@ -0,0 +1,21 @@
+# Build settings: optimise for the host CPU and compile as strict C99 with a
+# broad set of warnings enabled.
+CC = gcc
+CFLAGS = -O3 -march=native -std=c99 -pedantic -Wall -Wextra -Wshadow -Wpointer-arith -Wcast-qual -Wstrict-prototypes -Wmissing-prototypes
+
+# `all` and `clean` name actions, not files.
+.PHONY: all clean
+
+all: test_float test_double fast_copy.o fht.o
+
+# Objects linked into the non-header-only test programs.
+OBJ := fast_copy.o fht.o
+
+%.o: %.c
+	$(CC) $< -o $@ -c $(CFLAGS)
+
+test_%: test_%.c $(OBJ)
+	$(CC) $< $(OBJ) -o $@ $(CFLAGS)
+
+# The header-only tests are built from a single source file each, without $(OBJ).
+test_double_header_only: test_double_header_only.c
+	$(CC) $< -o $@ $(CFLAGS)
+
+test_float_header_only: test_float_header_only.c
+	$(CC) $< -o $@ $(CFLAGS)
+
+clean:
+	rm -f test_float test_double test_float_header_only test_double_header_only $(OBJ)
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md b/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md
@@ -0,0 +1,115 @@
+# Fast Fast Hadamard Transform
+
+FFHT (Fast Fast Hadamard Transform) is a library that provides a heavily
+optimized C99 implementation of the Fast Hadamard Transform. FFHT also provides
+a thin Python wrapper that allows one to perform the Fast Hadamard Transform on
+one-dimensional [NumPy](http://www.numpy.org/) arrays.
+
+The Hadamard Transform is a linear orthogonal map defined on real vectors whose
+length is a _power of two_. For the precise definition, see the
+[Wikipedia entry](https://en.wikipedia.org/wiki/Hadamard_transform). The
+Hadamard Transform has been recently used a lot in various machine learning
+and numerical algorithms.
+
+FFHT uses [AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions)
+to speed up the computation.
+
+The header file `fht.h` exports two functions: `int fht_float(float *buf, int
+log_n)` and `int fht_double(double *buf, int log_n)`. The
+only difference between them is the type of vector entries. So, in what follows,
+we describe how the version for floats `fht_float` works.
+
+The function `fht_float` takes two parameters:
+
+* `buf` is a pointer to the data on which one needs to perform the Fast
+Hadamard Transform.
+* `log_n` is the binary logarithm of the length of `buf`.
+That is, the length is equal to `2^log_n`.
+
+The return value is -1 if the input is invalid and is zero otherwise.
+
+A header-only version of the library is provided in `fht_header_only.h`.
+
+In addition to the Fast Hadamard Transform, we provide two auxiliary programs:
+`test_float` and `test_double`, which are implemented in C99. They exhaustively
+test and benchmark the library.
+
+FFHT has been tested on 64-bit versions of Linux, OS X and Windows (the latter
+is via Cygwin).
+
+To install the Python package, run `python setup.py install`. The script
+`example.py` shows how to use FFHT from Python.
+
+## Benchmarks
+
+Below are the times for the Fast Hadamard Transform for vectors of
+various lengths. The benchmarks were run on a machine with Intel
+Core&nbsp;i7-6700K and 2133 MHz DDR4 RAM. We compare FFHT,
+[FFTW 3.3.6](http://fftw.org/), and
+[fht](https://github.com/nbarbey/fht) by
+[Nicolas Barbey](https://github.com/nbarbey).
+
+Let us stress that FFTW is a great versatile tool, and the authors of FFTW did
+not try to optimize the performance of the Fast Hadamard Transform. On the other
+hand, FFHT does one thing (the Fast Hadamard Transform), but does it extremely
+well.
+
+Vector size | FFHT (float) | FFHT (double) | FFTW 3.3.6 (float) | FFTW 3.3.6 (double) | fht (float) | fht (double)
+:---: | :---: | :---: | :---: | :---: | :---: | :---:
+2<sup>10</sup> | 0.31 us | 0.49 us | 4.48 us | 7.72 us | 17.4 us | 19.3 us
+2<sup>20</sup> | 0.68 ms | 1.39 ms | 8.81 ms | 17.07 ms | 29.8 ms | 35.0 ms
+2<sup>27</sup> | 0.22 s | 0.50 s | 2.08 s | 3.57 s | 6.89 s | 7.49 s
+
+## Troubleshooting
+
+For some versions of OS X the native `clang` compiler (that mimics `gcc`) may
+not recognize the availability of AVX. A solution for this problem is to use a
+genuine `gcc` (say from [Homebrew](http://brew.sh/)) or to use `-march=corei7-avx`
+instead of `-march=native` for compiler flags.
+
+A symptom of the above happening is that the macro `__AVX__` is undefined.
+
+## Related Work
+
+FFHT has been created as a part of
+[FALCONN](https://github.com/falconn-lib/falconn): a library for similarity
+search over high-dimensional data. FALCONN's underlying algorithms are described
+and analyzed in the following research paper:
+
+> Alexandr Andoni, Piotr Indyk, Thijs Laarhoven, Ilya Razenshteyn and Ludwig
+> Schmidt, "Practical and Optimal LSH for Angular Distance", NIPS 2015, full
+> version available at [arXiv:1509.02897](http://arxiv.org/abs/1509.02897)
+
+This is the right paper to cite, if you use FFHT for your research projects.
+
+## Acknowledgments
+
+We thank Ruslan Savchenko for useful discussions.
+
+Thanks to:
+
+* Clement Canonne
+* Michal Forisek
+* Rati Gelashvili
+* Daniel Grier
+* Dhiraj Holden
+* Justin Holmgren
+* Aleksandar Ivanovic
+* Vladislav Isenbaev
+* Jacob Kogler
+* Ilya Kornakov
+* Anton Lapshin
+* Rio LaVigne
+* Oleg Martynov
+* Linar Mikeev
+* Cameron Musco
+* Sam Park
+* Sunoo Park
+* Amelia Perry
+* Andrew Sabisch
+* Abhishek Sarkar
+* Ruslan Savchenko
+* Vadim Semenov
+* Arman Yessenamanov
+
+for helping us with testing FFHT.
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_2.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_2.c
@@ -0,0 +1,128 @@
+#include <Python.h>
+#include <numpy/arrayobject.h>
+#include "fht.h"
+
+/* Evaluates `x` and discards the result by casting it to void; used below on
+ * the `self` parameter, which the wrapper never reads. */
+#define UNUSED(x) (void)(x)
+
+/* Docstring attached to the `ffht` extension module. */
+static char module_docstring[] =
+    "A C extension that computes the Fast Hadamard Transform";
+
+/*
+ * Docstring attached to `ffht.fht`. The wrapper parses exactly one argument
+ * (the array), so the text documents a single parameter.
+ */
+static char fht_docstring[] =
+    "Compute the Fast Hadamard Transform (FHT) for a given "
+    "one-dimensional NumPy array.\n\n"
+    "The Hadamard Transform is a linear orthogonal map defined on real vectors "
+    "whose length is a _power of two_. For the precise definition, see the "
+    "[Wikipedia entry](https://en.wikipedia.org/wiki/Hadamard_transform). The "
+    "Hadamard Transform has been recently used a lot in various machine "
+    "learning "
+    "and numerical algorithms.\n\n"
+    "The implementation uses "
+    "[AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) "
+    "to speed up the computation. If AVX is not supported on your machine, "
+    "a simpler implementation without (explicit) vectorization is used.\n\n"
+    "The function takes one parameter:\n\n"
+    "* `buffer` is a NumPy array which is being transformed. It must be a "
+    "one-dimensional array with `dtype` equal to `float32` or `float64` (the "
+    "former is recommended unless you need high accuracy) and of size being a "
+    "power "
+    "of two. If your CPU supports AVX, then `buffer` must be aligned to 32 "
+    "bytes. "
+    "To allocate such an aligned buffer, use the function `created_aligned` "
+    "from this "
+    "module.\n";
+
+/* Declared ahead of its definition so the method table can refer to it. */
+static PyObject *ffht_fht(PyObject *self, PyObject *args);
+
+/* Methods exported by the module; the all-NULL entry terminates the table. */
+static PyMethodDef module_methods[] = {
+    {"fht", ffht_fht, METH_VARARGS, fht_docstring},
+    {NULL, NULL, 0, NULL},
+};
+
+/* Prototype kept separate from the definition, as in the original file. */
+PyMODINIT_FUNC initffht(void);
+
+/*
+ * Module initialiser: registers the `ffht` module with its method table and
+ * docstring, then initialises the NumPy C API via import_array(). Nothing
+ * further is done when module creation fails.
+ */
+PyMODINIT_FUNC initffht(void) {
+  PyObject *module = Py_InitModule3("ffht", module_methods, module_docstring);
+
+  if (module != NULL) {
+    import_array();
+  }
+}
+
+/*
+ * Implements `ffht.fht(buffer)`.
+ *
+ * Checks that `buffer` is a one-dimensional NumPy array of float32 or float64
+ * whose length is a power of two and whose memory is a plain C array, then
+ * passes its data pointer and log2(length) to fht_float() / fht_double().
+ *
+ * Returns None on success. On failure returns NULL with an exception set:
+ *   - TypeError:    not an array, wrong dtype, or not one-dimensional;
+ *   - ValueError:   length is not a power of two, or unsupported memory layout;
+ *   - RuntimeError: the transform routine returned a non-zero status.
+ */
+static PyObject *ffht_fht(PyObject *self, PyObject *args) {
+  UNUSED(self);
+
+  PyObject *buffer_obj;
+
+  if (!PyArg_ParseTuple(args, "O", &buffer_obj)) {
+    return NULL;
+  }
+
+  PyArray_Descr *dtype;
+  int ndim;
+  npy_intp dims[NPY_MAXDIMS];
+  PyArrayObject *arr = NULL;
+
+  if (PyArray_GetArrayParamsFromObject(buffer_obj, NULL, 1, &dtype, &ndim, dims,
+                                       &arr, NULL) < 0) {
+    return NULL;
+  }
+
+  if (arr == NULL) {
+    PyErr_SetString(PyExc_TypeError, "not a numpy array");
+    return NULL;
+  }
+
+  /* From here on every exit path releases `arr` through the `done` label. */
+  PyObject *result = NULL;
+
+  dtype = PyArray_DESCR(arr);
+
+  if (dtype->type_num != NPY_FLOAT && dtype->type_num != NPY_DOUBLE) {
+    PyErr_SetString(PyExc_TypeError, "array must consist of floats or doubles");
+    goto done;
+  }
+
+  if (PyArray_NDIM(arr) != 1) {
+    PyErr_SetString(PyExc_TypeError, "array must be one-dimensional");
+    goto done;
+  }
+
+  /* Keep the length as npy_intp: narrowing it to int would let very long
+   * arrays slip past (or overflow inside) the power-of-two test. */
+  npy_intp n = PyArray_DIM(arr, 0);
+
+  if (n <= 0 || (n & (n - 1)) != 0) {
+    PyErr_SetString(PyExc_ValueError, "array's length must be a power of two");
+    goto done;
+  }
+
+  /* The transform routines receive only a pointer and log_n, so the data must
+   * be a dense, aligned, native-byte-order buffer. Strided or reversed views
+   * and byte-swapped arrays would otherwise be misread. */
+  if (!PyArray_ISCARRAY(arr)) {
+    PyErr_SetString(PyExc_ValueError,
+                    "array must be C-contiguous, aligned, writeable and in "
+                    "native byte order");
+    goto done;
+  }
+
+  /* n is a positive power of two, so this finds its exact binary logarithm;
+   * the shift is done in npy_intp to match the width of n. */
+  int log_n = 0;
+  while (((npy_intp)1 << log_n) < n) {
+    ++log_n;
+  }
+
+  void *raw_buffer = PyArray_DATA(arr);
+  int res;
+  if (dtype->type_num == NPY_FLOAT) {
+    res = fht_float((float *)raw_buffer, log_n);
+  } else {
+    res = fht_double((double *)raw_buffer, log_n);
+  }
+
+  if (res) {
+    PyErr_SetString(PyExc_RuntimeError, "FHT did not work properly");
+    goto done;
+  }
+
+  Py_INCREF(Py_None);
+  result = Py_None;
+
+done:
+  Py_DECREF(arr);
+  return result;
+}