From 9b01073f17e844c4f1faccb800d77f349cb0f294 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <github@benfrederickson.com>
Date: Thu, 6 Jan 2022 12:26:22 -0800
Subject: [PATCH 1/5] Use faiss for gpu inference

---
 implicit/gpu/knn.cu                       | 201 ++++++++++------------
 implicit/gpu/knn.h                        |   2 +-
 implicit/gpu/matrix_factorization_base.py |  14 +-
 setup.py                                  |   2 +-
 4 files changed, 104 insertions(+), 115 deletions(-)

diff --git a/implicit/gpu/knn.cu b/implicit/gpu/knn.cu
index 5c3d7608..30b02ace 100644
--- a/implicit/gpu/knn.cu
+++ b/implicit/gpu/knn.cu
@@ -9,18 +9,21 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
-#include <cub/iterator/counting_input_iterator.cuh>
-#include <cub/iterator/transform_input_iterator.cuh>
 #include <cub/device/device_segmented_radix_sort.cuh>
 #include <cub/device/device_radix_sort.cuh>
-#include <cub/block/block_radix_sort.cuh>
 
+#include <faiss/gpu/utils/DeviceTensor.cuh>
+#include <faiss/gpu/utils/BlockSelectKernel.cuh>
 
 #include "implicit/gpu/utils.cuh"
 #include "implicit/gpu/knn.h"
 #include "implicit/gpu/device_buffer.h"
 
 namespace implicit { namespace gpu {
+namespace {
+    const static int TILE_GROUPS = 32;
+    const static int MAX_TILE_ROWS = 32;
+}
 
 bool is_host_memory(void * address) {
     cudaPointerAttributes attr;
@@ -80,13 +83,18 @@ void copy_columns(const T * input, int rows, int cols, T * output, int output_co
     });
 }
 
-KnnQuery::KnnQuery(size_t temp_memory)
-    : max_temp_memory(temp_memory),
-    alloc(new StackAllocator(temp_memory)) {
+KnnQuery::KnnQuery(size_t temp_memory) {
+    if (!max_temp_memory) {
+      // use half of free GPU memory
+      size_t free, total;
+      CHECK_CUDA(cudaMemGetInfo(&free, &total));
+      temp_memory = std::min(free / 2, static_cast<size_t>(8000000000));
+    }
+    max_temp_memory = temp_memory;
+    alloc.reset(new StackAllocator(temp_memory));
     CHECK_CUBLAS(cublasCreate(&blas_handle));
 }
 
-const static int MAX_SELECT_K = 128;
 
 void KnnQuery::topk(const Matrix & items, const Matrix & query, int k,
                     int * indices, float * distances, float * item_norms,
@@ -96,7 +104,9 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k,
         throw std::invalid_argument("Must have same number of columns in each matrix for topk");
     }
 
-    size_t available_temp_memory = max_temp_memory;
+    // limit to temp memory 8GB or so (causes some issues if we have over 2^31 entries in our
+    // matrix
+    size_t available_temp_memory = std::min(max_temp_memory, static_cast<size_t>(8000000000));
 
     float * host_distances = NULL;
     size_t distances_size = query.rows * k * sizeof(float);
@@ -114,24 +124,39 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k,
         available_temp_memory -= indices_size;
     }
 
+    // Create temporary memory for storing results. We're padding out temp memory so that
+    // we can tile the columns (break up a single row to multiple top-k operations) if there
+    // aren't many rows in the input
+    size_t temp_distances_cols = items.rows;
+    size_t padding = temp_distances_cols % TILE_GROUPS;
+    if (padding) {
+        temp_distances_cols += TILE_GROUPS - padding;
+    }
+
+    // just in case we're tiling each row, we'l need some temp memory for that too
+    available_temp_memory -= TILE_GROUPS * MAX_TILE_ROWS * k * (sizeof(float) + sizeof(int));
+
     // We need 6 copies of the matrix for argsort code - and then some
     // extra memory per SM as well.
-    int batch_size = available_temp_memory / (sizeof(float) * items.rows);
-    if (k >= MAX_SELECT_K) {
+    size_t batch_size = (available_temp_memory / (sizeof(float) * static_cast<size_t>(items.rows)));
+    if (k >= GPU_MAX_SELECTION_K) {
         batch_size *= 0.15;
-    } else {
-        batch_size *= 0.5;
     }
 
-    batch_size = std::min(batch_size, query.rows);
-    batch_size = std::max(batch_size, 1);
+    batch_size = std::min(batch_size, static_cast<size_t>(query.rows));
+    batch_size = std::max(batch_size, static_cast<size_t>(1));
 
-    // Create temporary memory for storing results
-    void * temp_mem = alloc->allocate(batch_size * items.rows * sizeof(float));
-    Matrix temp_distances(batch_size, items.rows, reinterpret_cast<float *>(temp_mem), false);
+    void * temp_mem = alloc->allocate(batch_size * temp_distances_cols * sizeof(float));
+    Matrix temp_distances(batch_size, temp_distances_cols, reinterpret_cast<float *>(temp_mem), false);
+
+    // Fill temp_distances if we're padding so that results don't appear
+    if (padding) {
+        thrust::device_ptr<float> data = thrust::device_pointer_cast(temp_distances.data);
+        thrust::fill(data, data + temp_distances.rows * temp_distances.cols, -FLT_MAX);
+    }
 
     for (int start = 0; start < query.rows; start += batch_size) {
-        auto end = std::min(query.rows, start + batch_size);
+        auto end = std::min(query.rows, start + static_cast<int>(batch_size));
 
         Matrix batch(query, start, end);
         temp_distances.rows = batch.rows;
@@ -149,23 +174,27 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k,
 
         // If we have norms (cosine distance etc) normalize the results here
         if (item_norms != NULL) {
-            auto count = thrust::make_counting_iterator<int>(0);
+            auto count = thrust::make_counting_iterator<size_t>(0);
             int cols = temp_distances.cols;
+            int item_norm_cols = items.rows;
             float * data = temp_distances.data;
-            thrust::for_each(count, count + (temp_distances.rows * temp_distances.cols),
-               [=] __device__(int i) {
-                 data[i] /= item_norms[i % cols];
+            thrust::for_each(count, count + (static_cast<size_t>(temp_distances.rows) * static_cast<size_t>(temp_distances.cols)),
+               [=] __device__(size_t i) {
+                  int col = i % cols;
+                  if (col < item_norm_cols) {
+                    data[i] /= item_norms[col];
+                  }
             });
         }
 
         if (item_filter != NULL) {
-            auto count = thrust::make_counting_iterator<int>(0);
+            auto count = thrust::make_counting_iterator<size_t>(0);
             float * data = temp_distances.data;
             int * items = item_filter->data;
             int items_size = item_filter->size;
             int cols = temp_distances.cols;
             thrust::for_each(count, count + items_size * temp_distances.rows,
-               [=] __device__(int i) {
+               [=] __device__(size_t i) {
                   int col = items[i % items_size];
                   int row = i / items_size;
                   data[row * cols + col] = -FLT_MAX;
@@ -173,7 +202,7 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k,
         }
 
         if (query_filter != NULL) {
-            auto count = thrust::make_counting_iterator<int>(0);
+            auto count = thrust::make_counting_iterator<size_t>(0);
             int * row = query_filter->row;
             int * col = query_filter->col;
             float * data = temp_distances.data;
@@ -203,46 +232,10 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k,
     }
 }
 
-
-static const int ARGPARTITION_BLOCK_DIM_X = 128;
-static const int ARGPARTITION_ITEMS_PER_THREAD = 16;
-static const int ARGPARTITION_SORT_SIZE = ARGPARTITION_BLOCK_DIM_X * ARGPARTITION_ITEMS_PER_THREAD;
-
-__global__ void argpartition_kernel(const int * indices, const float * distances,
-                                    int rows, int cols, int k,
-                                    int * out_indices,
-                                    float * out_distances) {
-    using BlockRadixSort = cub::BlockRadixSort<float, ARGPARTITION_BLOCK_DIM_X, ARGPARTITION_ITEMS_PER_THREAD, int>;
-    __shared__ typename BlockRadixSort::TempStorage shared_mem;
-
-    float keys[ARGPARTITION_ITEMS_PER_THREAD];
-    int values[ARGPARTITION_ITEMS_PER_THREAD];
-
-    int rowid = blockIdx.y;
-    for (int i = 0; i < ARGPARTITION_ITEMS_PER_THREAD; i++) {
-        int colid = blockIdx.x * blockDim.x + threadIdx.x + i * (blockDim.x * gridDim.x);
-        if (colid < cols) {
-            keys[i] = distances[rowid * cols + colid];
-            values[i] = indices == NULL ? colid : indices[rowid * cols + colid];
-        } else {
-            keys[i] = -FLT_MAX;
-            values[i] = -1;
-        }
-    }
-
-    BlockRadixSort(shared_mem).SortDescendingBlockedToStriped(keys, values);
-
-    if (threadIdx.x < k) {
-        int out_col = threadIdx.x + blockIdx.x * k;
-        out_distances[out_col + rowid * k * gridDim.x] = keys[0];
-        out_indices[out_col + rowid * k * gridDim.x] = values[0];
-    }
-}
-
 void KnnQuery::argpartition(const Matrix & items, int k, int * indices, float * distances) {
     k = std::min(k, items.cols);
 
-    if (k >= MAX_SELECT_K) {
+    if (k >= GPU_MAX_SELECTION_K) {
         int * temp_indices = reinterpret_cast<int *>(alloc->allocate(items.rows * items.cols * sizeof(int)));
         float * temp_distances = reinterpret_cast<float *>(alloc->allocate(items.rows * items.cols * sizeof(float)));
         argsort(items, temp_indices, temp_distances);
@@ -256,58 +249,48 @@ void KnnQuery::argpartition(const Matrix & items, int k, int * indices, float *
     int rows = items.rows;
     int cols = items.cols;
 
-    int blocks_per_row = (cols + ARGPARTITION_SORT_SIZE - 1) / ARGPARTITION_SORT_SIZE;
-
-    // maintain a double buffer of input/output indices and distances
-    float * distA = reinterpret_cast<float *>(alloc->allocate(rows * k * blocks_per_row * sizeof(float)));
-    int * indA = reinterpret_cast<int *>(alloc->allocate(rows * k * blocks_per_row * sizeof(int)));
-    blocks_per_row = (blocks_per_row * k + ARGPARTITION_SORT_SIZE - 1) / ARGPARTITION_SORT_SIZE;
-    float * distB = reinterpret_cast<float *>(alloc->allocate(rows * k * blocks_per_row * sizeof(float)));
-    int * indB = reinterpret_cast<int *>(alloc->allocate(rows * k * blocks_per_row * sizeof(int)));
-
-    const float * input_distances = items.data;
-    const int * input_indices = NULL;
-    float * output_distances = distA;
-    int * output_indices = indA;
-    bool outputA = true;
-
-    while (true) {
-        int blocks_per_row = (cols + ARGPARTITION_SORT_SIZE - 1) / ARGPARTITION_SORT_SIZE;
-        dim3 block_count(blocks_per_row, items.rows, 1);
-
-        bool final = block_count.x <= 1;
-        if (final) {
-            output_distances = distances;
-            output_indices = indices;
-        }
-
-        argpartition_kernel<<<block_count, ARGPARTITION_BLOCK_DIM_X>>>(
-            input_indices, input_distances,
-            rows, cols, k,
-            output_indices, output_distances);
-
-        if (final) break;
+    // faiss runBlockSelect isn't the fastest option when there aren't that many rows, since
+    // each row in the query only gets a single thread block to process it. For queries with
+    // a small number of rows, we're going to break up each row into TILE_GROUPS sub-rows,
+    // in one runBlockSelect, and then combine the results from those in a final select op.
+    bool tile_rows = (rows <= MAX_TILE_ROWS) && (cols % TILE_GROUPS == 0) && (cols >= 65536);
+    if (tile_rows) {
+        // Run the first block select on the sub-rows
+        int rows_tile = rows * TILE_GROUPS;
+        int cols_tile = cols / TILE_GROUPS;
+        int * temp_indices = reinterpret_cast<int *>(alloc->allocate(rows_tile * k * sizeof(int)));
+        float * temp_distances = reinterpret_cast<float *>(alloc->allocate(rows_tile * k * sizeof(float)));
+        faiss::gpu::DeviceTensor<float, 2, true> items_tensor(const_cast<float *>(items.data), {rows_tile, cols_tile});
+        faiss::gpu::DeviceTensor<float, 2, true> temp_distances_tensor(temp_distances, {rows_tile, k});
+        faiss::gpu::DeviceTensor<int, 2, true> temp_indices_tensor(temp_indices, {rows_tile, k});
+        faiss::gpu::runBlockSelect(items_tensor, temp_distances_tensor, temp_indices_tensor, true, k, 0);
+
+        // Calculate the true index for all the topk results (since the current temp_indices will be relative to the split values)
+        auto count = thrust::make_counting_iterator<size_t>(0);
+        thrust::for_each(count, count + rows_tile * k,
+            [=] __device__(int i) {
+                int offset = cols_tile * ((i / k) % TILE_GROUPS);
+                temp_indices[i] += offset;
+            });
 
-        // reduce the number of columns we process next iteration to the output of the current
-        // input
-        cols = block_count.x * k;
+        // reshape the temp tensors we calculated in the first pass, and then get the actual
+        // output
+        faiss::gpu::DeviceTensor<float, 2, true> temp_input_distances_tensor(temp_distances, {rows, k * TILE_GROUPS});
+        faiss::gpu::DeviceTensor<int, 2, true> temp_input_indices_tensor(temp_indices, {rows, k * TILE_GROUPS});
+        faiss::gpu::DeviceTensor<float, 2, true> distances_tensor(distances, {rows, k});
+        faiss::gpu::DeviceTensor<int, 2, true> indices_tensor(indices, {rows, k});
+        faiss::gpu::runBlockSelectPair(temp_input_distances_tensor, temp_input_indices_tensor, distances_tensor, indices_tensor, true, k, 0);
 
-        // set the input of the next run to the output of the current run
-        // (and the output to an unused block of memory)
-        input_distances = output_distances;
-        input_indices = output_indices;
-        output_distances = outputA ? distB : distA;
-        output_indices = outputA ? indB : indA;
-        outputA = !outputA;
+        alloc->deallocate(temp_distances);
+        alloc->deallocate(temp_indices);
+    } else {
+        faiss::gpu::DeviceTensor<float, 2, true> items_tensor(const_cast<float *>(items.data), {rows, cols});
+        faiss::gpu::DeviceTensor<float, 2, true> distances_tensor(distances, {rows, k});
+        faiss::gpu::DeviceTensor<int, 2, true> indices_tensor(indices, {rows, k});
+        faiss::gpu::runBlockSelect(items_tensor, distances_tensor, indices_tensor, true, k, 0);
     }
 
     CHECK_CUDA(cudaDeviceSynchronize());
-
-    // Free up temp memory
-    alloc->deallocate(indB);
-    alloc->deallocate(distB);
-    alloc->deallocate(indA);
-    alloc->deallocate(distA);
 }
 
 void KnnQuery::argsort(const Matrix & items, int * indices, float * distances) {
diff --git a/implicit/gpu/knn.h b/implicit/gpu/knn.h
index d4e499d2..52c67d2c 100644
--- a/implicit/gpu/knn.h
+++ b/implicit/gpu/knn.h
@@ -11,7 +11,7 @@ struct StackAllocator;
 
 class KnnQuery {
 public:
-    KnnQuery(size_t temp_memory=512000000);
+    KnnQuery(size_t temp_memory=0);
     ~KnnQuery();
     cublasContext * blas_handle;
 
diff --git a/implicit/gpu/matrix_factorization_base.py b/implicit/gpu/matrix_factorization_base.py
index cdef3697..a6234921 100644
--- a/implicit/gpu/matrix_factorization_base.py
+++ b/implicit/gpu/matrix_factorization_base.py
@@ -29,7 +29,7 @@ def __init__(self):
         self._user_norms = None
         self._user_norms_host = None
         self._item_norms_host = None
-        self._knn = implicit.gpu.KnnQuery()
+        self._knn = None
 
     def recommend(
         self,
@@ -80,7 +80,7 @@ def recommend(
                 query_filter = None
 
         # calculate the top N items, removing the users own liked items from the results
-        ids, scores = self._knn.topk(
+        ids, scores = self.knn.topk(
             item_factors,
             self.user_factors[userid],
             N,
@@ -112,6 +112,12 @@ def item_norms(self):
             self._item_norms_host = self._item_norms.to_numpy().reshape(self._item_norms.shape[1])
         return self._item_norms
 
+    @property
+    def knn(self):
+        if self._knn is None:
+            self._knn = implicit.gpu.KnnQuery()
+        return self._knn
+
     def similar_users(self, userid, N=10, filter_users=None, users=None):
         norms = self.user_norms
         user_factors = self.user_factors
@@ -132,7 +138,7 @@ def similar_users(self, userid, N=10, filter_users=None, users=None):
         if filter_users is not None:
             filter_users = implicit.gpu.IntVector(np.array(filter_users, dtype="int32"))
 
-        ids, scores = self._knn.topk(
+        ids, scores = self.knn.topk(
             user_factors, self.user_factors[userid], N, norms, item_filter=filter_users
         )
 
@@ -174,7 +180,7 @@ def similar_items(
         if filter_items is not None:
             filter_items = implicit.gpu.IntVector(np.array(filter_items, dtype="int32"))
 
-        ids, scores = self._knn.topk(
+        ids, scores = self.knn.topk(
             item_factors, self.item_factors[itemid], N, norms, item_filter=filter_items
         )
 
diff --git a/setup.py b/setup.py
index 3dffb043..ba8d3d7f 100644
--- a/setup.py
+++ b/setup.py
@@ -92,7 +92,7 @@ def define_extensions():
                 # extra_compile_args=compile_args,
                 extra_link_args=link_args,
                 library_dirs=library_dirs,
-                libraries=["cudart", "cublas", "curand"],
+                libraries=["cudart", "cublas", "curand", "faiss"],
                 include_dirs=include_dirs,
             )
         )

From 0e3f2234f2044018748e8194a8891c344b0ba2be Mon Sep 17 00:00:00 2001
From: Ben Frederickson <github@benfrederickson.com>
Date: Thu, 6 Jan 2022 14:42:22 -0800
Subject: [PATCH 2/5] Staticly link faiss

Include faiss as a gitsubmodule, and then link the couple of functions we need
statically
---
 .gitmodules      |  3 +++
 MANIFEST.in      | 12 ++++++++++--
 cuda_setup.py    | 35 +++++------------------------------
 setup.cfg        |  2 +-
 setup.py         | 19 ++++++++++++++++---
 thirdparty/faiss |  1 +
 6 files changed, 36 insertions(+), 36 deletions(-)
 create mode 100644 .gitmodules
 create mode 160000 thirdparty/faiss

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..6120fd77
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "thirdparty/faiss"]
+	path = thirdparty/faiss
+	url = https://github.com/facebookresearch/faiss.git
diff --git a/MANIFEST.in b/MANIFEST.in
index 76e9ebb3..63eb856e 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,8 +1,16 @@
 recursive-include implicit *.h *.cu *.cuh *.pyx *.py *.cpp *.pxd
-recursive-exclude tests *
-recursive-exclude examples *
 include cuda_setup.py
 include README.md
 include requirements.txt
 include LICENSE
 include tox.ini
+
+recursive-include thirdparty/faiss/faiss/gpu/utils/ *.cu *.cuh *.h
+include thirdparty/faiss/faiss/gpu/GpuResources.cpp
+include thirdparty/faiss/faiss/gpu/GpuResources.h
+include thirdparty/faiss/faiss/gpu/GpuFaissAssert.h
+include thirdparty/faiss/faiss/gpu/DeviceUtils.h
+include thirdparty/faiss/faiss/impl/FaissAssert.h
+include thirdparty/faiss/faiss/impl/FaissException.h
+include thirdparty/faiss/faiss/impl/platform_macros.h
+include thirdparty/faiss/faiss/MetricType.h
diff --git a/cuda_setup.py b/cuda_setup.py
index 443a510b..bfd5bd89 100644
--- a/cuda_setup.py
+++ b/cuda_setup.py
@@ -92,14 +92,8 @@ class _UnixCCompiler(unixccompiler.UnixCCompiler):
     src_extensions.append(".cu")
 
     def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts):
-        # For sources other than CUDA C ones, just call the super class method.
-        if os.path.splitext(src)[1] != ".cu":
-            return unixccompiler.UnixCCompiler._compile(
-                self, obj, src, ext, cc_args, extra_postargs, pp_opts
-            )
-
         # For CUDA C source files, compile them with NVCC.
-        _compiler_so = self.compiler_so
+        _compiler_so = self.compiler_so  # pylint: disable=access-member-before-definition
         try:
             nvcc_path = CUDA["nvcc"]
             post_args = CUDA["post_args"]
@@ -110,7 +104,7 @@ def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts):
                 self, obj, src, ext, cc_args, post_args, pp_opts
             )
         finally:
-            self.compiler_so = _compiler_so
+            self.compiler_so = _compiler_so  # pylint: disable=attribute-defined-outside-init
 
 
 class _MSVCCompiler(msvccompiler.MSVCCompiler):
@@ -119,7 +113,7 @@ class _MSVCCompiler(msvccompiler.MSVCCompiler):
     src_extensions = list(unixccompiler.UnixCCompiler.src_extensions)
     src_extensions.extend(_cu_extensions)
 
-    def _compile_cu(
+    def compile(
         self,
         sources,
         output_dir=None,
@@ -151,25 +145,6 @@ def _compile_cu(
 
         return objects
 
-    def compile(self, sources, **kwargs):
-        # Split CUDA C sources and others.
-        cu_sources = []
-        other_sources = []
-        for source in sources:
-            if os.path.splitext(source)[1] == ".cu":
-                cu_sources.append(source)
-            else:
-                other_sources.append(source)
-
-        # Compile source files other than CUDA C ones.
-        other_objects = msvccompiler.MSVCCompiler.compile(self, other_sources, **kwargs)
-
-        # Compile CUDA C sources.
-        cu_objects = self._compile_cu(cu_sources, **kwargs)
-
-        # Return compiled object filenames.
-        return other_objects + cu_objects
-
 
 class cuda_build_ext(setuptools_build_ext):
     """Custom `build_ext` command to include CUDA C source files."""
@@ -182,7 +157,7 @@ def _wrap_new_compiler(*args, **kwargs):
                     try:
                         return func(*args, **kwargs)
                     except errors.DistutilsPlatformError:
-                        if not sys.platform == "win32":
+                        if sys.platform != "win32":
                             CCompiler = _UnixCCompiler
                         else:
                             CCompiler = _MSVCCompiler
@@ -193,7 +168,7 @@ def _wrap_new_compiler(*args, **kwargs):
             ccompiler.new_compiler = wrap_new_compiler(ccompiler.new_compiler)
             # Intentionally causes DistutilsPlatformError in
             # ccompiler.new_compiler() function to hook.
-            self.compiler = "nvidia"
+            self.compiler = "nvidia"  # pylint: disable=attribute-defined-outside-init
 
         setuptools_build_ext.run(self)
 
diff --git a/setup.cfg b/setup.cfg
index 4bb5ac10..91133161 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -9,7 +9,7 @@ description-file = README.md
 [flake8]
 max-line-length = 100
 exclude = build,.eggs,.tox
-ignore = E203
+ignore = E203,W503
 
 [isort]
 multi_line_output = 3
diff --git a/setup.py b/setup.py
index ba8d3d7f..45d3323f 100644
--- a/setup.py
+++ b/setup.py
@@ -68,17 +68,30 @@ def define_extensions():
     )
 
     if CUDA:
+        faiss_path = os.path.join("thirdparty", "faiss")
         conda_prefix = os.getenv("CONDA_PREFIX")
-        include_dirs = [CUDA["include"], "."]
+        include_dirs = [faiss_path, CUDA["include"], "."]
         library_dirs = [CUDA["lib64"]]
+        include_dirs.append(faiss_path)
         if conda_prefix:
             include_dirs.append(os.path.join(conda_prefix, "include"))
             library_dirs.append(os.path.join(conda_prefix, "lib"))
 
+        block_select_dir = os.path.join(faiss_path, "faiss", "gpu", "utils", "blockselect")
+        faiss_paths = [
+            os.path.join(block_select_dir, p)
+            for p in os.listdir(block_select_dir)
+            if p.endswith(".cu")
+        ]
+        faiss_paths.append(os.path.join(faiss_path, "faiss", "gpu", "utils", "BlockSelectFloat.cu"))
+        faiss_paths.append(os.path.join(faiss_path, "faiss", "gpu", "GpuResources.cpp"))
+        faiss_paths.append(os.path.join(faiss_path, "faiss", "gpu", "utils", "DeviceUtils.cu"))
+
         modules.append(
             Extension(
                 "implicit.gpu._cuda",
-                [
+                faiss_paths
+                + [
                     os.path.join("implicit", "gpu", "_cuda" + src_ext),
                     os.path.join("implicit", "gpu", "als.cu"),
                     os.path.join("implicit", "gpu", "bpr.cu"),
@@ -92,7 +105,7 @@ def define_extensions():
                 # extra_compile_args=compile_args,
                 extra_link_args=link_args,
                 library_dirs=library_dirs,
-                libraries=["cudart", "cublas", "curand", "faiss"],
+                libraries=["cudart", "cublas", "curand"],
                 include_dirs=include_dirs,
             )
         )
diff --git a/thirdparty/faiss b/thirdparty/faiss
new file mode 160000
index 00000000..c08cbff1
--- /dev/null
+++ b/thirdparty/faiss
@@ -0,0 +1 @@
+Subproject commit c08cbff1a4d6c9afb6b8f69004c5530aaf80237a

From bd5689d0ee91f9c115f465040bd33b7fe731cc7f Mon Sep 17 00:00:00 2001
From: Ben Frederickson <github@benfrederickson.com>
Date: Thu, 6 Jan 2022 21:35:34 -0800
Subject: [PATCH 3/5] update manifest

---
 MANIFEST.in | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 63eb856e..b8bfba1f 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -9,7 +9,6 @@ recursive-include thirdparty/faiss/faiss/gpu/utils/ *.cu *.cuh *.h
 include thirdparty/faiss/faiss/gpu/GpuResources.cpp
 include thirdparty/faiss/faiss/gpu/GpuResources.h
 include thirdparty/faiss/faiss/gpu/GpuFaissAssert.h
-include thirdparty/faiss/faiss/gpu/DeviceUtils.h
 include thirdparty/faiss/faiss/impl/FaissAssert.h
 include thirdparty/faiss/faiss/impl/FaissException.h
 include thirdparty/faiss/faiss/impl/platform_macros.h

From 2782cfc84ff9173c9cfea75e7f116d3cbe05213c Mon Sep 17 00:00:00 2001
From: Ben Frederickson <github@benfrederickson.com>
Date: Thu, 6 Jan 2022 22:16:44 -0800
Subject: [PATCH 4/5] Fix rank_items tests

---
 implicit/gpu/knn.cu | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/implicit/gpu/knn.cu b/implicit/gpu/knn.cu
index 30b02ace..0c2d9019 100644
--- a/implicit/gpu/knn.cu
+++ b/implicit/gpu/knn.cu
@@ -23,6 +23,13 @@ namespace implicit { namespace gpu {
 namespace {
     const static int TILE_GROUPS = 32;
     const static int MAX_TILE_ROWS = 32;
+
+    // faiss seems to have issues when distances contain -FLT_MAX, and can return a '-1' in the
+    // indices returned, instead of an actual valid row number. When we filter, instead of
+    // setting to -FLT_MAX, set to the next smallest valid float32 value.
+    const static float _FLT_MAX = FLT_MAX;
+    const static uint32_t UINT_FILTER_DISTANCE = (*reinterpret_cast<const uint32_t*>(&_FLT_MAX)) - 1;
+    const static float FLT_FILTER_DISTANCE = - *reinterpret_cast<const float*>(&UINT_FILTER_DISTANCE);
 }
 
 bool is_host_memory(void * address) {
@@ -193,11 +200,12 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k,
             int * items = item_filter->data;
             int items_size = item_filter->size;
             int cols = temp_distances.cols;
+            float filter_distance = FLT_FILTER_DISTANCE;
             thrust::for_each(count, count + items_size * temp_distances.rows,
                [=] __device__(size_t i) {
                   int col = items[i % items_size];
                   int row = i / items_size;
-                  data[row * cols + col] = -FLT_MAX;
+                  data[row * cols + col] = filter_distance;
             });
         }
 
@@ -207,10 +215,11 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k,
             int * col = query_filter->col;
             float * data = temp_distances.data;
             int items = temp_distances.cols;
+            float filter_distance = FLT_FILTER_DISTANCE;
             thrust::for_each(count, count + query_filter->nonzeros,
                 [=] __device__(int i) {
                     if ((row[i] >= start) && (row[i] < end)) {
-                        data[(row[i] -start) * items + col[i]] = -FLT_MAX;
+                        data[(row[i] - start) * items + col[i]] = filter_distance;
                     }
             });
         }

From 8caa5e1d85216aeffb38f974da53d6a6319e0522 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <github@benfrederickson.com>
Date: Fri, 7 Jan 2022 10:23:50 -0800
Subject: [PATCH 5/5] Add sm80/sm86 to cuda_Setup.py

avoids long ptx JIT times on startup
---
 cuda_setup.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/cuda_setup.py b/cuda_setup.py
index bfd5bd89..07f89f81 100644
--- a/cuda_setup.py
+++ b/cuda_setup.py
@@ -59,16 +59,29 @@ def locate_cuda():
         "lib64": os.path.join(home, "lib64"),
     }
 
-    post_args = [
+    arch_flags = [
         "-arch=sm_60",
+        "-gencode=arch=compute_37,code=sm_37",
+        "-gencode=arch=compute_50,code=sm_50",
+        "-gencode=arch=compute_52,code=sm_52",
         "-gencode=arch=compute_60,code=sm_60",
         "-gencode=arch=compute_61,code=sm_61",
         "-gencode=arch=compute_70,code=sm_70",
-        "-gencode=arch=compute_70,code=compute_70",
+        "-gencode=arch=compute_75,code=sm_75",
+        "-gencode=arch=compute_80,code=sm_80",
+        "-gencode=arch=compute_86,code=sm_86",
+        "-gencode=arch=compute_86,code=compute_86",
+    ]
+
+    # hack to speed up cuda compilation on my devbox
+    if os.getenv("IMPLICIT_CUDA_ARCH") == "sm86":
+        arch_flags = ["-arch=sm_86", "-gencode=arch=compute_86,code=sm_86"]
+
+    post_args = [
         "--ptxas-options=-v",
         "--extended-lambda",
         "-O2",
-    ]
+    ] + arch_flags
 
     if sys.platform == "win32":
         cudaconfig["lib64"] = os.path.join(home, "lib", "x64")