From 9b01073f17e844c4f1faccb800d77f349cb0f294 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Thu, 6 Jan 2022 12:26:22 -0800 Subject: [PATCH 1/5] Use faiss for gpu inference --- implicit/gpu/knn.cu | 201 ++++++++++------------ implicit/gpu/knn.h | 2 +- implicit/gpu/matrix_factorization_base.py | 14 +- setup.py | 2 +- 4 files changed, 104 insertions(+), 115 deletions(-) diff --git a/implicit/gpu/knn.cu b/implicit/gpu/knn.cu index 5c3d7608..30b02ace 100644 --- a/implicit/gpu/knn.cu +++ b/implicit/gpu/knn.cu @@ -9,18 +9,21 @@ #include #include -#include -#include #include #include -#include +#include +#include #include "implicit/gpu/utils.cuh" #include "implicit/gpu/knn.h" #include "implicit/gpu/device_buffer.h" namespace implicit { namespace gpu { +namespace { + const static int TILE_GROUPS = 32; + const static int MAX_TILE_ROWS = 32; +} bool is_host_memory(void * address) { cudaPointerAttributes attr; @@ -80,13 +83,18 @@ void copy_columns(const T * input, int rows, int cols, T * output, int output_co }); } -KnnQuery::KnnQuery(size_t temp_memory) - : max_temp_memory(temp_memory), - alloc(new StackAllocator(temp_memory)) { +KnnQuery::KnnQuery(size_t temp_memory) { + if (!max_temp_memory) { + // use half of free GPU memory + size_t free, total; + CHECK_CUDA(cudaMemGetInfo(&free, &total)); + temp_memory = std::min(free / 2, static_cast(8000000000)); + } + max_temp_memory = temp_memory; + alloc.reset(new StackAllocator(temp_memory)); CHECK_CUBLAS(cublasCreate(&blas_handle)); } -const static int MAX_SELECT_K = 128; void KnnQuery::topk(const Matrix & items, const Matrix & query, int k, int * indices, float * distances, float * item_norms, @@ -96,7 +104,9 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k, throw std::invalid_argument("Must have same number of columns in each matrix for topk"); } - size_t available_temp_memory = max_temp_memory; + // limit to temp memory 8GB or so (causes some issues if we have over 2^31 entries in our + // matrix + size_t available_temp_memory = std::min(max_temp_memory, static_cast(8000000000)); float * host_distances = NULL; size_t distances_size = query.rows * k * sizeof(float); @@ -114,24 +124,39 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k, available_temp_memory -= indices_size; } + // Create temporary memory for storing results. We're padding out temp memory so that + // we can tile the columns (break up a single row to multiple top-k operations) if there + // aren't many rows in the input + size_t temp_distances_cols = items.rows; + size_t padding = temp_distances_cols % TILE_GROUPS; + if (padding) { + temp_distances_cols += TILE_GROUPS - padding; + } + + // just in case we're tiling each row, we'l need some temp memory for that too + available_temp_memory -= TILE_GROUPS * MAX_TILE_ROWS * k * (sizeof(float) + sizeof(int)); + // We need 6 copies of the matrix for argsort code - and then some // extra memory per SM as well. - int batch_size = available_temp_memory / (sizeof(float) * items.rows); - if (k >= MAX_SELECT_K) { + size_t batch_size = (available_temp_memory / (sizeof(float) * static_cast(items.rows))); + if (k >= GPU_MAX_SELECTION_K) { batch_size *= 0.15; - } else { - batch_size *= 0.5; } - batch_size = std::min(batch_size, query.rows); - batch_size = std::max(batch_size, 1); + batch_size = std::min(batch_size, static_cast(query.rows)); + batch_size = std::max(batch_size, static_cast(1)); - // Create temporary memory for storing results - void * temp_mem = alloc->allocate(batch_size * items.rows * sizeof(float)); - Matrix temp_distances(batch_size, items.rows, reinterpret_cast(temp_mem), false); + void * temp_mem = alloc->allocate(batch_size * temp_distances_cols * sizeof(float)); + Matrix temp_distances(batch_size, temp_distances_cols, reinterpret_cast(temp_mem), false); + + // Fill temp_distances if we're padding so that results don't appear + if (padding) { + thrust::device_ptr data = thrust::device_pointer_cast(temp_distances.data); + thrust::fill(data, data + temp_distances.rows * temp_distances.cols, -FLT_MAX); + } for (int start = 0; start < query.rows; start += batch_size) { - auto end = std::min(query.rows, start + batch_size); + auto end = std::min(query.rows, start + static_cast(batch_size)); Matrix batch(query, start, end); temp_distances.rows = batch.rows; @@ -149,23 +174,27 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k, // If we have norms (cosine distance etc) normalize the results here if (item_norms != NULL) { - auto count = thrust::make_counting_iterator(0); + auto count = thrust::make_counting_iterator(0); int cols = temp_distances.cols; + int item_norm_cols = items.rows; float * data = temp_distances.data; - thrust::for_each(count, count + (temp_distances.rows * temp_distances.cols), - [=] __device__(int i) { - data[i] /= item_norms[i % cols]; + thrust::for_each(count, count + (static_cast(temp_distances.rows) * static_cast(temp_distances.cols)), + [=] __device__(size_t i) { + int col = i % cols; + if (col < item_norm_cols) { + data[i] /= item_norms[col]; + } }); } if (item_filter != NULL) { - auto count = thrust::make_counting_iterator(0); + auto count = thrust::make_counting_iterator(0); float * data = temp_distances.data; int * items = item_filter->data; int items_size = item_filter->size; int cols = temp_distances.cols; thrust::for_each(count, count + items_size * temp_distances.rows, - [=] __device__(int i) { + [=] __device__(size_t i) { int col = items[i % items_size]; int row = i / items_size; data[row * cols + col] = -FLT_MAX; @@ -173,7 +202,7 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k, } if (query_filter != NULL) { - auto count = thrust::make_counting_iterator(0); + auto count = thrust::make_counting_iterator(0); int * row = query_filter->row; int * col = query_filter->col; float * data = temp_distances.data; @@ -203,46 +232,10 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k, } } - -static const int ARGPARTITION_BLOCK_DIM_X = 128; -static const int ARGPARTITION_ITEMS_PER_THREAD = 16; -static const int ARGPARTITION_SORT_SIZE = ARGPARTITION_BLOCK_DIM_X * ARGPARTITION_ITEMS_PER_THREAD; - -__global__ void argpartition_kernel(const int * indices, const float * distances, - int rows, int cols, int k, - int * out_indices, - float * out_distances) { - using BlockRadixSort = cub::BlockRadixSort; - __shared__ typename BlockRadixSort::TempStorage shared_mem; - - float keys[ARGPARTITION_ITEMS_PER_THREAD]; - int values[ARGPARTITION_ITEMS_PER_THREAD]; - - int rowid = blockIdx.y; - for (int i = 0; i < ARGPARTITION_ITEMS_PER_THREAD; i++) { - int colid = blockIdx.x * blockDim.x + threadIdx.x + i * (blockDim.x * gridDim.x); - if (colid < cols) { - keys[i] = distances[rowid * cols + colid]; - values[i] = indices == NULL ? colid : indices[rowid * cols + colid]; - } else { - keys[i] = -FLT_MAX; - values[i] = -1; - } - } - - BlockRadixSort(shared_mem).SortDescendingBlockedToStriped(keys, values); - - if (threadIdx.x < k) { - int out_col = threadIdx.x + blockIdx.x * k; - out_distances[out_col + rowid * k * gridDim.x] = keys[0]; - out_indices[out_col + rowid * k * gridDim.x] = values[0]; - } -} - void KnnQuery::argpartition(const Matrix & items, int k, int * indices, float * distances) { k = std::min(k, items.cols); - if (k >= MAX_SELECT_K) { + if (k >= GPU_MAX_SELECTION_K) { int * temp_indices = reinterpret_cast(alloc->allocate(items.rows * items.cols * sizeof(int))); float * temp_distances = reinterpret_cast(alloc->allocate(items.rows * items.cols * sizeof(float))); argsort(items, temp_indices, temp_distances); @@ -256,58 +249,48 @@ void KnnQuery::argpartition(const Matrix & items, int k, int * indices, float * int rows = items.rows; int cols = items.cols; - int blocks_per_row = (cols + ARGPARTITION_SORT_SIZE - 1) / ARGPARTITION_SORT_SIZE; - - // maintain a double buffer of input/output indices and distances - float * distA = reinterpret_cast(alloc->allocate(rows * k * blocks_per_row * sizeof(float))); - int * indA = reinterpret_cast(alloc->allocate(rows * k * blocks_per_row * sizeof(int))); - blocks_per_row = (blocks_per_row * k + ARGPARTITION_SORT_SIZE - 1) / ARGPARTITION_SORT_SIZE; - float * distB = reinterpret_cast(alloc->allocate(rows * k * blocks_per_row * sizeof(float))); - int * indB = reinterpret_cast(alloc->allocate(rows * k * blocks_per_row * sizeof(int))); - - const float * input_distances = items.data; - const int * input_indices = NULL; - float * output_distances = distA; - int * output_indices = indA; - bool outputA = true; - - while (true) { - int blocks_per_row = (cols + ARGPARTITION_SORT_SIZE - 1) / ARGPARTITION_SORT_SIZE; - dim3 block_count(blocks_per_row, items.rows, 1); - - bool final = block_count.x <= 1; - if (final) { - output_distances = distances; - output_indices = indices; - } - - argpartition_kernel<<>>( - input_indices, input_distances, - rows, cols, k, - output_indices, output_distances); - - if (final) break; + // faiss runBlockSelect isn't the fastest option when there aren't that many rows, since + // each row in the query only gets a single thread block to process it. For queries with + // a small number of rows, we're going to break up each row into TILE_GROUPS sub-rows, + // in one runBlockSelect, and then combine the results from those in a final select op. + bool tile_rows = (rows <= MAX_TILE_ROWS) && (cols % TILE_GROUPS == 0) && (cols >= 65536); + if (tile_rows) { + // Run the first block select on the sub-rows + int rows_tile = rows * TILE_GROUPS; + int cols_tile = cols / TILE_GROUPS; + int * temp_indices = reinterpret_cast(alloc->allocate(rows_tile * k * sizeof(int))); + float * temp_distances = reinterpret_cast(alloc->allocate(rows_tile * k * sizeof(float))); + faiss::gpu::DeviceTensor items_tensor(const_cast(items.data), {rows_tile, cols_tile}); + faiss::gpu::DeviceTensor temp_distances_tensor(temp_distances, {rows_tile, k}); + faiss::gpu::DeviceTensor temp_indices_tensor(temp_indices, {rows_tile, k}); + faiss::gpu::runBlockSelect(items_tensor, temp_distances_tensor, temp_indices_tensor, true, k, 0); + + // Calculate the true index for all the topk results (since the current temp_indices will be relative to the split values) + auto count = thrust::make_counting_iterator(0); + thrust::for_each(count, count + rows_tile * k, + [=] __device__(int i) { + int offset = cols_tile * ((i / k) % TILE_GROUPS); + temp_indices[i] += offset; + }); - // reduce the number of columns we process next iteration to the output of the current - // input - cols = block_count.x * k; + // reshape the temp tensors we calculated in the first pass, and then get the actual + // output + faiss::gpu::DeviceTensor temp_input_distances_tensor(temp_distances, {rows, k * TILE_GROUPS}); + faiss::gpu::DeviceTensor temp_input_indices_tensor(temp_indices, {rows, k * TILE_GROUPS}); + faiss::gpu::DeviceTensor distances_tensor(distances, {rows, k}); + faiss::gpu::DeviceTensor indices_tensor(indices, {rows, k}); + faiss::gpu::runBlockSelectPair(temp_input_distances_tensor, temp_input_indices_tensor, distances_tensor, indices_tensor, true, k, 0); - // set the input of the next run to the output of the current run - // (and the output to an unused block of memory) - input_distances = output_distances; - input_indices = output_indices; - output_distances = outputA ? distB : distA; - output_indices = outputA ? indB : indA; - outputA = !outputA; + alloc->deallocate(temp_distances); + alloc->deallocate(temp_indices); + } else { + faiss::gpu::DeviceTensor items_tensor(const_cast(items.data), {rows, cols}); + faiss::gpu::DeviceTensor distances_tensor(distances, {rows, k}); + faiss::gpu::DeviceTensor indices_tensor(indices, {rows, k}); + faiss::gpu::runBlockSelect(items_tensor, distances_tensor, indices_tensor, true, k, 0); } CHECK_CUDA(cudaDeviceSynchronize()); - - // Free up temp memory - alloc->deallocate(indB); - alloc->deallocate(distB); - alloc->deallocate(indA); - alloc->deallocate(distA); } void KnnQuery::argsort(const Matrix & items, int * indices, float * distances) { diff --git a/implicit/gpu/knn.h b/implicit/gpu/knn.h index d4e499d2..52c67d2c 100644 --- a/implicit/gpu/knn.h +++ b/implicit/gpu/knn.h @@ -11,7 +11,7 @@ struct StackAllocator; class KnnQuery { public: - KnnQuery(size_t temp_memory=512000000); + KnnQuery(size_t temp_memory=0); ~KnnQuery(); cublasContext * blas_handle; diff --git a/implicit/gpu/matrix_factorization_base.py b/implicit/gpu/matrix_factorization_base.py index cdef3697..a6234921 100644 --- a/implicit/gpu/matrix_factorization_base.py +++ b/implicit/gpu/matrix_factorization_base.py @@ -29,7 +29,7 @@ def __init__(self): self._user_norms = None self._user_norms_host = None self._item_norms_host = None - self._knn = implicit.gpu.KnnQuery() + self._knn = None def recommend( self, @@ -80,7 +80,7 @@ def recommend( query_filter = None # calculate the top N items, removing the users own liked items from the results - ids, scores = self._knn.topk( + ids, scores = self.knn.topk( item_factors, self.user_factors[userid], N, @@ -112,6 +112,12 @@ def item_norms(self): self._item_norms_host = self._item_norms.to_numpy().reshape(self._item_norms.shape[1]) return self._item_norms + @property + def knn(self): + if self._knn is None: + self._knn = implicit.gpu.KnnQuery() + return self._knn + def similar_users(self, userid, N=10, filter_users=None, users=None): norms = self.user_norms user_factors = self.user_factors @@ -132,7 +138,7 @@ def similar_users(self, userid, N=10, filter_users=None, users=None): if filter_users is not None: filter_users = implicit.gpu.IntVector(np.array(filter_users, dtype="int32")) - ids, scores = self._knn.topk( + ids, scores = self.knn.topk( user_factors, self.user_factors[userid], N, norms, item_filter=filter_users ) @@ -174,7 +180,7 @@ def similar_items( if filter_items is not None: filter_items = implicit.gpu.IntVector(np.array(filter_items, dtype="int32")) - ids, scores = self._knn.topk( + ids, scores = self.knn.topk( item_factors, self.item_factors[itemid], N, norms, item_filter=filter_items ) diff --git a/setup.py b/setup.py index 3dffb043..ba8d3d7f 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ def define_extensions(): # extra_compile_args=compile_args, extra_link_args=link_args, library_dirs=library_dirs, - libraries=["cudart", "cublas", "curand"], + libraries=["cudart", "cublas", "curand", "faiss"], include_dirs=include_dirs, ) ) From 0e3f2234f2044018748e8194a8891c344b0ba2be Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Thu, 6 Jan 2022 14:42:22 -0800 Subject: [PATCH 2/5] Staticly link faiss Include faiss as a gitsubmodule, and then link the couple of functions we need statically --- .gitmodules | 3 +++ MANIFEST.in | 12 ++++++++++-- cuda_setup.py | 35 +++++------------------------------ setup.cfg | 2 +- setup.py | 19 ++++++++++++++++--- thirdparty/faiss | 1 + 6 files changed, 36 insertions(+), 36 deletions(-) create mode 100644 .gitmodules create mode 160000 thirdparty/faiss diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..6120fd77 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "thirdparty/faiss"] + path = thirdparty/faiss + url = https://github.com/facebookresearch/faiss.git diff --git a/MANIFEST.in b/MANIFEST.in index 76e9ebb3..63eb856e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,8 +1,16 @@ recursive-include implicit *.h *.cu *.cuh *.pyx *.py *.cpp *.pxd -recursive-exclude tests * -recursive-exclude examples * include cuda_setup.py include README.md include requirements.txt include LICENSE include tox.ini + +recursive-include thirdparty/faiss/faiss/gpu/utils/ *.cu *.cuh *.h +include thirdparty/faiss/faiss/gpu/GpuResources.cpp +include thirdparty/faiss/faiss/gpu/GpuResources.h +include thirdparty/faiss/faiss/gpu/GpuFaissAssert.h +include thirdparty/faiss/faiss/gpu/DeviceUtils.h +include thirdparty/faiss/faiss/impl/FaissAssert.h +include thirdparty/faiss/faiss/impl/FaissException.h +include thirdparty/faiss/faiss/impl/platform_macros.h +include thirdparty/faiss/faiss/MetricType.h diff --git a/cuda_setup.py b/cuda_setup.py index 443a510b..bfd5bd89 100644 --- a/cuda_setup.py +++ b/cuda_setup.py @@ -92,14 +92,8 @@ class _UnixCCompiler(unixccompiler.UnixCCompiler): src_extensions.append(".cu") def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): - # For sources other than CUDA C ones, just call the super class method. - if os.path.splitext(src)[1] != ".cu": - return unixccompiler.UnixCCompiler._compile( - self, obj, src, ext, cc_args, extra_postargs, pp_opts - ) - # For CUDA C source files, compile them with NVCC. - _compiler_so = self.compiler_so + _compiler_so = self.compiler_so # pylint: disable=access-member-before-definition try: nvcc_path = CUDA["nvcc"] post_args = CUDA["post_args"] @@ -110,7 +104,7 @@ def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): self, obj, src, ext, cc_args, post_args, pp_opts ) finally: - self.compiler_so = _compiler_so + self.compiler_so = _compiler_so # pylint: disable=attribute-defined-outside-init class _MSVCCompiler(msvccompiler.MSVCCompiler): @@ -119,7 +113,7 @@ class _MSVCCompiler(msvccompiler.MSVCCompiler): src_extensions = list(unixccompiler.UnixCCompiler.src_extensions) src_extensions.extend(_cu_extensions) - def _compile_cu( + def compile( self, sources, output_dir=None, @@ -151,25 +145,6 @@ def _compile_cu( return objects - def compile(self, sources, **kwargs): - # Split CUDA C sources and others. - cu_sources = [] - other_sources = [] - for source in sources: - if os.path.splitext(source)[1] == ".cu": - cu_sources.append(source) - else: - other_sources.append(source) - - # Compile source files other than CUDA C ones. - other_objects = msvccompiler.MSVCCompiler.compile(self, other_sources, **kwargs) - - # Compile CUDA C sources. - cu_objects = self._compile_cu(cu_sources, **kwargs) - - # Return compiled object filenames. - return other_objects + cu_objects - class cuda_build_ext(setuptools_build_ext): """Custom `build_ext` command to include CUDA C source files.""" @@ -182,7 +157,7 @@ def _wrap_new_compiler(*args, **kwargs): try: return func(*args, **kwargs) except errors.DistutilsPlatformError: - if not sys.platform == "win32": + if sys.platform != "win32": CCompiler = _UnixCCompiler else: CCompiler = _MSVCCompiler @@ -193,7 +168,7 @@ def _wrap_new_compiler(*args, **kwargs): ccompiler.new_compiler = wrap_new_compiler(ccompiler.new_compiler) # Intentionally causes DistutilsPlatformError in # ccompiler.new_compiler() function to hook. - self.compiler = "nvidia" + self.compiler = "nvidia" # pylint: disable=attribute-defined-outside-init setuptools_build_ext.run(self) diff --git a/setup.cfg b/setup.cfg index 4bb5ac10..91133161 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,7 +9,7 @@ description-file = README.md [flake8] max-line-length = 100 exclude = build,.eggs,.tox -ignore = E203 +ignore = E203,W503 [isort] multi_line_output = 3 diff --git a/setup.py b/setup.py index ba8d3d7f..45d3323f 100644 --- a/setup.py +++ b/setup.py @@ -68,17 +68,30 @@ def define_extensions(): ) if CUDA: + faiss_path = os.path.join("thirdparty", "faiss") conda_prefix = os.getenv("CONDA_PREFIX") - include_dirs = [CUDA["include"], "."] + include_dirs = [faiss_path, CUDA["include"], "."] library_dirs = [CUDA["lib64"]] + include_dirs.append(faiss_path) if conda_prefix: include_dirs.append(os.path.join(conda_prefix, "include")) library_dirs.append(os.path.join(conda_prefix, "lib")) + block_select_dir = os.path.join(faiss_path, "faiss", "gpu", "utils", "blockselect") + faiss_paths = [ + os.path.join(block_select_dir, p) + for p in os.listdir(block_select_dir) + if p.endswith(".cu") + ] + faiss_paths.append(os.path.join(faiss_path, "faiss", "gpu", "utils", "BlockSelectFloat.cu")) + faiss_paths.append(os.path.join(faiss_path, "faiss", "gpu", "GpuResources.cpp")) + faiss_paths.append(os.path.join(faiss_path, "faiss", "gpu", "utils", "DeviceUtils.cu")) + modules.append( Extension( "implicit.gpu._cuda", - [ + faiss_paths + + [ os.path.join("implicit", "gpu", "_cuda" + src_ext), os.path.join("implicit", "gpu", "als.cu"), os.path.join("implicit", "gpu", "bpr.cu"), @@ -92,7 +105,7 @@ def define_extensions(): # extra_compile_args=compile_args, extra_link_args=link_args, library_dirs=library_dirs, - libraries=["cudart", "cublas", "curand", "faiss"], + libraries=["cudart", "cublas", "curand"], include_dirs=include_dirs, ) ) diff --git a/thirdparty/faiss b/thirdparty/faiss new file mode 160000 index 00000000..c08cbff1 --- /dev/null +++ b/thirdparty/faiss @@ -0,0 +1 @@ +Subproject commit c08cbff1a4d6c9afb6b8f69004c5530aaf80237a From bd5689d0ee91f9c115f465040bd33b7fe731cc7f Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Thu, 6 Jan 2022 21:35:34 -0800 Subject: [PATCH 3/5] update manifest --- MANIFEST.in | 1 - 1 file changed, 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 63eb856e..b8bfba1f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -9,7 +9,6 @@ recursive-include thirdparty/faiss/faiss/gpu/utils/ *.cu *.cuh *.h include thirdparty/faiss/faiss/gpu/GpuResources.cpp include thirdparty/faiss/faiss/gpu/GpuResources.h include thirdparty/faiss/faiss/gpu/GpuFaissAssert.h -include thirdparty/faiss/faiss/gpu/DeviceUtils.h include thirdparty/faiss/faiss/impl/FaissAssert.h include thirdparty/faiss/faiss/impl/FaissException.h include thirdparty/faiss/faiss/impl/platform_macros.h From 2782cfc84ff9173c9cfea75e7f116d3cbe05213c Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Thu, 6 Jan 2022 22:16:44 -0800 Subject: [PATCH 4/5] Fix rank_items tests --- implicit/gpu/knn.cu | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/implicit/gpu/knn.cu b/implicit/gpu/knn.cu index 30b02ace..0c2d9019 100644 --- a/implicit/gpu/knn.cu +++ b/implicit/gpu/knn.cu @@ -23,6 +23,13 @@ namespace implicit { namespace gpu { namespace { const static int TILE_GROUPS = 32; const static int MAX_TILE_ROWS = 32; + + // faiss seems to have issues when distances contain -FLT_MAX, and can return a '-1' in the + // indices returned, instead of an actual valid row number. When we filter, instead of + // setting to -FLT_MAX, set to the next smallest valid float32 value. + const static float _FLT_MAX = FLT_MAX; + const static uint32_t UINT_FILTER_DISTANCE = (*reinterpret_cast(&_FLT_MAX)) - 1; + const static float FLT_FILTER_DISTANCE = - *reinterpret_cast(&UINT_FILTER_DISTANCE); } bool is_host_memory(void * address) { @@ -193,11 +200,12 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k, int * items = item_filter->data; int items_size = item_filter->size; int cols = temp_distances.cols; + float filter_distance = FLT_FILTER_DISTANCE; thrust::for_each(count, count + items_size * temp_distances.rows, [=] __device__(size_t i) { int col = items[i % items_size]; int row = i / items_size; - data[row * cols + col] = -FLT_MAX; + data[row * cols + col] = filter_distance; }); } @@ -207,10 +215,11 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k, int * col = query_filter->col; float * data = temp_distances.data; int items = temp_distances.cols; + float filter_distance = FLT_FILTER_DISTANCE; thrust::for_each(count, count + query_filter->nonzeros, [=] __device__(int i) { if ((row[i] >= start) && (row[i] < end)) { - data[(row[i] -start) * items + col[i]] = -FLT_MAX; + data[(row[i] - start) * items + col[i]] = filter_distance; } }); } From 8caa5e1d85216aeffb38f974da53d6a6319e0522 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Fri, 7 Jan 2022 10:23:50 -0800 Subject: [PATCH 5/5] Add sm80/sm86 to cuda_Setup.py avoids long ptx JIT times on startup --- cuda_setup.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cuda_setup.py b/cuda_setup.py index bfd5bd89..07f89f81 100644 --- a/cuda_setup.py +++ b/cuda_setup.py @@ -59,16 +59,29 @@ def locate_cuda(): "lib64": os.path.join(home, "lib64"), } - post_args = [ + arch_flags = [ "-arch=sm_60", + "-gencode=arch=compute_37,code=sm_37", + "-gencode=arch=compute_50,code=sm_50", + "-gencode=arch=compute_52,code=sm_52", "-gencode=arch=compute_60,code=sm_60", "-gencode=arch=compute_61,code=sm_61", "-gencode=arch=compute_70,code=sm_70", - "-gencode=arch=compute_70,code=compute_70", + "-gencode=arch=compute_75,code=sm_75", + "-gencode=arch=compute_80,code=sm_80", + "-gencode=arch=compute_86,code=sm_86", + "-gencode=arch=compute_86,code=compute_86", + ] + + # hack to speed up cuda compilation on my devbox + if os.getenv("IMPLICIT_CUDA_ARCH") == "sm86": + arch_flags = ["-arch=sm_86", "-gencode=arch=compute_86,code=sm_86"] + + post_args = [ "--ptxas-options=-v", "--extended-lambda", "-O2", - ] + ] + arch_flags if sys.platform == "win32": cudaconfig["lib64"] = os.path.join(home, "lib", "x64")