#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/AccumulateType.h>
#include <ATen/div_rtn.h>
#include <ATen/cuda/CUDABlas.h>
#include <ATen/cuda/detail/KernelUtils.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/native/cuda/block_reduce.cuh>
#include <ATen/native/Resize.h>
#include <ATen/native/IndexingUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty.h>
#include <ATen/ops/_conv_depthwise2d_native.h>
#endif

namespace at::native {
namespace {
using at::cuda::detail::CUDA_NUM_THREADS;
using at::cuda::detail::GET_BLOCKS;

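// Builds a placeholder accessor (null data pointer, zero sizes/strides) so an optional
// tensor such as bias can still be passed to the kernels by value when it is not defined;
// the kernels never dereference it in that case.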
template <typename scalar_t, int ndim, template <typename U> class PtrTraits = DefaultPtrTraits>
PackedTensorAccessor32<scalar_t, ndim, PtrTraits> dummy_packed_accessor32() {
  std::array<int64_t, ndim> zeros{};
  return {nullptr, zeros.data(), zeros.data()};
}

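// Generic depthwise forward kernel: one thread per output element (N, C_out, H_out, W_out),
// with the valid filter-tap range [kHmin, kHmax) x [kWmin, kWmax) computed up front so the
// inner loops need no per-tap bounds checks.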
template <typename scalar_t, typename index_t>
__global__ void
#if !defined(USE_ROCM)
C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS)
#endif
conv_depthwise2d_forward_kernel_generic(
    const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> input,
    PackedTensorAccessor32<scalar_t, 4, DefaultPtrTraits> output,
    const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> weight,
    const PackedTensorAccessor32<const scalar_t, 1, DefaultPtrTraits> bias,
    bool biasEnabled,
    index_t totalElements,
    const int outputChannels,
    const int depthwiseMultiplier,
    const int inputWidth, const int inputHeight,
    const int outputWidth, const int outputHeight,
    const int kernelWidth, const int kernelHeight,
    const int strideWidth, const int strideHeight,
    const int padWidth, const int padHeight,
    const int dilationWidth, const int dilationHeight) {
  using acc_t = at::acc_type<scalar_t, true>;

  CUDA_KERNEL_LOOP_TYPE(linearIndex, totalElements, index_t) {
    // Calculate the n,c,h,w indices, replacing modulos by divide and multiply-add;
    // the result is the same as it would be with the code below:
    //const int n = linearIndex / batchStride; //batchStride = outputChannels * outputHeight * outputWidth
    //const int c = (linearIndex / channelStride) % outputChannels; //channelStride = outputHeight * outputWidth
    //const int h = (linearIndex / outputWidth) % outputHeight;
    //const int w = linearIndex % outputWidth;

    int indtmp1 = linearIndex/outputWidth;
    const int w = linearIndex - indtmp1 * outputWidth;
    int indtmp2 = indtmp1/outputHeight;
    const int h = indtmp1 - indtmp2 * outputHeight;
    indtmp1 = indtmp2;
    indtmp2 = indtmp1/outputChannels;
    const int c = indtmp1 - indtmp2 * outputChannels;
    const int n = indtmp2;

    int inputChannel = c;
    int inputChannels = outputChannels;
    if (depthwiseMultiplier != 1) {
      inputChannel /= depthwiseMultiplier;
      inputChannels /= depthwiseMultiplier;
    }

    int weightOffset = c * kernelHeight * kernelWidth;

    // By precisely computing the filtering boundaries, we avoid repeating several
    // expensive edge-condition checks for every fetched item. If the input element is
    // resident in L1, the extra branches and comparisons would be comparable in cost
    // (in cycles) to the actual data fetch, so computing the boundaries ahead of the
    // loop showed a significant performance boost.
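    //
    // For example, with padHeight = 1, strideHeight = 1, dilationHeight = 1 and h = 0,
    // h_in_min = -1, so kHmin becomes 1 and the kH = 0 tap (which would read input row -1)
    // is skipped without a per-tap check. Symmetrically, h_in_max measures how far the last
    // tap overshoots the bottom edge; when it is >= 0, kHmax is reduced by
    // ceil(h_in_max / dilationHeight).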

    int kHmin = 0, kHmax = kernelHeight, kWmin = 0, kWmax = kernelWidth;

    // Top
    int h_in_min = -padHeight + h * strideHeight;
    if (h_in_min < 0) {
      kHmin = -h_in_min / dilationHeight;
      if ((-h_in_min) % dilationHeight > 0) {
        kHmin++;
      }
    }

    // Bottom
    int h_in_max = h_in_min + (kernelHeight - 1) * dilationHeight - inputHeight + 1;
    if (h_in_max >= 0) {
      kHmax = kernelHeight - h_in_max / dilationHeight;
      if (h_in_max % dilationHeight > 0) {
        kHmax--;
      }
    }

    // Left
    int w_in_min = -padWidth + w * strideWidth;
    if (w_in_min < 0) {
      kWmin = -w_in_min / dilationWidth;
      if ((-w_in_min) % dilationWidth > 0) {
        kWmin++;
      }
    }

    // Right
    int w_in_max = w_in_min + (kernelWidth - 1) * dilationWidth - inputWidth + 1;
    if (w_in_max >= 0) {
      kWmax = kernelWidth - w_in_max / dilationWidth;
      if (w_in_max % dilationWidth > 0) {
        kWmax--;
      }
    }

    acc_t value = biasEnabled ? static_cast<acc_t>(bias.data()[c]) : acc_t(0);
    const index_t offset0 = (n * inputChannels + inputChannel) * inputHeight * inputWidth;

    for (int kH = kHmin; kH < kHmax; ++kH) {
      const int h_in = -padHeight + h * strideHeight + kH * dilationHeight;
      for (int kW = kWmin; kW < kWmax; ++kW) {
        const int w_in = -padWidth + w * strideWidth + kW * dilationWidth;
        const index_t offset = offset0 + h_in * inputWidth + w_in;
        value += (static_cast<acc_t>(weight.data()[weightOffset + kH * kernelWidth + kW]) *
                    static_cast<acc_t>(input.data()[offset]));
      }
    }
    output.data()[linearIndex] = static_cast<scalar_t>(value);
  }
}

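// Specialized depthwise forward kernel: when kSize != 0 the kernel extent is a compile-time
// constant (1, 3, or 5 below), letting the compiler fully unroll the tap loops; kSize == 0
// falls back to the runtime kernelWidth/kernelHeight with per-tap bounds checks.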
template <int kSize, typename scalar_t, typename index_t>
__global__ void
#if !defined(USE_ROCM)
C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS)
#endif
conv_depthwise2d_forward_kernel(
    const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> input,
    PackedTensorAccessor32<scalar_t, 4, DefaultPtrTraits> output,
    const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> weight,
    const PackedTensorAccessor32<const scalar_t, 1, DefaultPtrTraits> bias,
    bool biasEnabled,
    index_t totalElements,
    const int outputChannels,
    const int depthwiseMultiplier,
    const int inputWidth, const int inputHeight,
    const int outputWidth, const int outputHeight,
    const int kernelWidth, const int kernelHeight,
    const int strideWidth, const int strideHeight,
    const int padWidth, const int padHeight,
    const int dilationWidth, const int dilationHeight) {
  using acc_t = at::acc_type<scalar_t, true>;
  const int KW_LIMIT = (kSize != 0) ? kSize : kernelWidth;
  const int KH_LIMIT = (kSize != 0) ? kSize : kernelHeight;

  CUDA_KERNEL_LOOP_TYPE(linearIndex, totalElements, index_t) {
    // Calculate the n,c,h,w indices, replacing modulos by divide and multiply-add;
    // the result is the same as it would be with the code below:
    //const int n = linearIndex / batchStride; //batchStride = outputChannels * outputHeight * outputWidth
    //const int c = (linearIndex / channelStride) % outputChannels; //channelStride = outputHeight * outputWidth
    //const int h = (linearIndex / outputWidth) % outputHeight;
    //const int w = linearIndex % outputWidth;

    int indtmp1 = linearIndex/outputWidth;
    const int w = linearIndex - indtmp1 * outputWidth;
    int indtmp2 = indtmp1/outputHeight;
    const int h = indtmp1 - indtmp2 * outputHeight;
    indtmp1 = indtmp2;
    indtmp2 = indtmp1/outputChannels;
    const int c = indtmp1 - indtmp2 * outputChannels;
    const int n = indtmp2;

    int inputChannel = c;
    int inputChannels = outputChannels;
    if (depthwiseMultiplier != 1) {
      inputChannel /= depthwiseMultiplier;
      inputChannels /= depthwiseMultiplier;
    }

    int weightOffset = c * kernelHeight * kernelWidth;

    acc_t value = biasEnabled ? static_cast<acc_t>(bias.data()[c]) : acc_t(0);
    const index_t offset0 = (n * inputChannels + inputChannel) * inputHeight * inputWidth;
#if !defined(USE_ROCM)
#pragma unroll
#endif
    for (int kH = 0; kH < KH_LIMIT; ++kH) {
#if !defined(USE_ROCM)
#pragma unroll
#endif
      for (int kW = 0; kW < KW_LIMIT; ++kW) {
        const int h_in = -padHeight + h * strideHeight + kH * dilationHeight;
        const int w_in = -padWidth + w * strideWidth + kW * dilationWidth;

        if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && (w_in < inputWidth)) {
          const index_t offset = offset0 + h_in * inputWidth + w_in;
          value += (static_cast<acc_t>(weight.data()[weightOffset]) *
                    static_cast<acc_t>(input.data()[offset]));
        }
        ++weightOffset;
      }
    }
    output.data()[linearIndex] = static_cast<scalar_t>(value);
  }
}

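// Backward (grad_input) kernel: one thread per input element. Each thread gathers the
// grad_output positions that the corresponding filter taps map back to, transposed-convolution
// style, skipping taps whose output coordinate is not an exact multiple of the stride or falls
// outside the output. The kSize/stride template parameters enable unrolling for the common
// cases; 0 means "use the runtime value".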
template <int kSize, int stride, typename scalar_t, typename index_t>
#if !defined(USE_ROCM)
C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS)
#endif
__global__ void conv_depthwise2d_backward_kernel(
    const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> grad_output,
    PackedTensorAccessor32<scalar_t, 4, DefaultPtrTraits> grad_input,
    const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> weight,
    index_t totalElements,
    const int inputChannels,
    const int depthwiseMultiplier,
    const int outputChannels,
    const int inputWidth, const int inputHeight,
    const int outputWidth, const int outputHeight,
    const int kernelWidth, const int kernelHeight,
    const int strideWidth, const int strideHeight,
    const int padWidth, const int padHeight,
    const int dilationWidth, const int dilationHeight) {
  using acc_t = at::acc_type<scalar_t, true>;
  const int KW_LIMIT = (kSize != 0) ? kSize : kernelWidth;
  const int KH_LIMIT = (kSize != 0) ? kSize : kernelHeight;
  const int strideW = (stride != 0) ? stride : strideWidth;
  const int strideH = (stride != 0) ? stride : strideHeight;

  CUDA_KERNEL_LOOP_TYPE(linearIndex, totalElements, index_t) {
    int indtmp1 = linearIndex/inputWidth;
    const int w = linearIndex - indtmp1 * inputWidth;
    int indtmp2 = indtmp1/inputHeight;
    const int h = indtmp1 - indtmp2 * inputHeight;
    indtmp1 = indtmp2;
    indtmp2 = indtmp1/inputChannels;
    const int c = indtmp1 - indtmp2 * inputChannels;
    const int n = indtmp2;

    acc_t value(0);

    for (int multiplier = 0; multiplier < depthwiseMultiplier; ++multiplier) {
      int och = (c * depthwiseMultiplier) + multiplier;
      int weightOffset = och * kernelHeight * kernelWidth;
      for (int kh = 0; kh < KH_LIMIT; ++kh) {
#if !defined(USE_ROCM)
#pragma unroll
#endif
        for (int kw = 0; kw < KW_LIMIT; ++kw) {
          int h_out = h + padHeight - kh * dilationHeight;
          int w_out = w + padWidth - kw * dilationWidth;
          if ((h_out % strideH == 0) && (w_out % strideW == 0)) {
            h_out = h_out / strideH;
            w_out = w_out / strideW;

            if ((h_out >= 0) && (h_out < outputHeight)
                  && (w_out >= 0) && (w_out < outputWidth)) {

              const int offset = ((n * outputChannels + och) * outputHeight + h_out)
                    * outputWidth + w_out;
              value += (static_cast<acc_t>(weight.data()[weightOffset]) *
                        static_cast<acc_t>(grad_output.data()[offset]));
            }
          }
          ++weightOffset;
        }
      }
    }
    grad_input.data()[linearIndex] = static_cast<scalar_t>(value);
  }
}

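// Weight-gradient kernel: each block owns a single (output channel, kH, kW) entry of
// grad_weight and reduces input * grad_output products over the whole N x H_out x W_out
// volume; the per-thread partial sums are combined with a block reduction in shared memory
// before one thread writes the result.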
template <typename scalar_t, typename index_t=unsigned>
__global__ void conv_depthwise2d_grad_weight_kernel(
    const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> grad_output,
    const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> input,
    PackedTensorAccessor32<scalar_t, 4, DefaultPtrTraits> grad_weight,
    const int batchSize,
    const int inputChannels,
    const int kernelChannels,
    const int depthwiseMultiplier,
    const int inputWidth, const int inputHeight,
    const int outputWidth, const int outputHeight,
    const int kernelWidth, const int kernelHeight,
    const int strideWidth, const int strideHeight,
    const int padWidth, const int padHeight,
    const int dilationWidth, const int dilationHeight) {
  using acc_t = at::acc_type<scalar_t, true>;
  const int channelStride = kernelWidth * kernelHeight;

  // Each block is responsible for accumulating over one (channel, kH, kW) combination;
  // use blockIdx to determine which one.
  int bidx = blockIdx.x;
  int kW = bidx % kernelWidth;
  int kH = (bidx / kernelWidth) % kernelHeight;
  int ch = (bidx / channelStride);

  // Need to calculate which input channel is associated with this filter
  // channel
  int inputCh = ch / depthwiseMultiplier;

  acc_t grad(0);

  const int laneId = threadIdx.x % C10_WARP_SIZE;
  const int batch = threadIdx.x / C10_WARP_SIZE;
  const int nwarps = blockDim.x / C10_WARP_SIZE;
  const int imageElements = outputWidth * outputHeight;
  // Use one warp per batch item. In the original kernel, a threadblock was used to sum over
  // NHW. Here, we use a warp to sum values over the HW dimensions, and if batchSize is larger
  // than the number of warps, a warp loops over the remaining batch items (e.g. with 8 warps,
  // warp 0 handles images 0, 8, 16, ..., warp 1 handles 1, 9, 17, ...). Later, in the block
  // reduction, all the warps are reduced anyway, so the full reduction is still over NHW, as
  // it should be. This gets rid of one modulo operation inside the loop (because n/batchIdx
  // no longer has to be computed through a modulo; we just loop over it), which brings a nice
  // speed-up.
  for (int batchIdx = batch; batchIdx < batchSize; batchIdx += nwarps){
    // Warp-stride loop over elements in a batch item
    for (index_t idx = laneId; idx < imageElements; idx += C10_WARP_SIZE) {
      // Need to calculate the batch position and the offsets into grad_output in height and
      // width; the corresponding position in the input follows from the other parameters we have.
      int go_w_offset = idx % outputWidth;
      int go_h_offset = (idx / outputWidth);

      int i_w_offset = (go_w_offset * strideWidth) + (kW * dilationWidth) - padWidth;
      int i_h_offset = (go_h_offset * strideHeight) + (kH * dilationHeight) - padHeight;

      if (i_w_offset >= 0 && i_h_offset >= 0 && i_w_offset < inputWidth && i_h_offset < inputHeight) {
        int inputOffset = ((batchIdx * inputChannels + inputCh) * inputHeight + i_h_offset) * inputWidth + i_w_offset;
        int outputOffset = ((batchIdx * kernelChannels + ch) * outputHeight) * outputWidth + idx;
        grad += (static_cast<acc_t>(input.data()[inputOffset]) *
                 static_cast<acc_t>(grad_output.data()[outputOffset]));
      }
    }
  }

  // At this point each thread in the block has a local gradient, which we need to
  // accumulate prior to writing the global value
  extern __shared__ char smem[];
  acc_t* buf = reinterpret_cast<acc_t*>(smem);
  acc_t tval = cuda_utils::BlockReduceSum(grad, buf);

  // After the reduction, the first thread in the block has the gradient, so it is responsible
  // for writing it to grad_weight
  if (threadIdx.x == 0) {
    int weightOffset = kW + (kernelWidth * kH) + (kernelWidth * kernelHeight * ch);
    grad_weight.data()[weightOffset] = static_cast<scalar_t>(tval);
  }
}

void conv_depthwise2d_forward_out(
                  const Tensor &input,
                  const Tensor &output,
                  const Tensor &weight,
                  const Tensor &bias,
                  const int kW, const int kH,
                  const int dW, const int dH,
                  const int padW, const int padH,
                  const int dilationW, const int dilationH) {
  // Only handle 4D Input Tensors for now
  TORCH_CHECK(input.numel() > 0 && input.dim() == 4);
  TORCH_CHECK(weight.numel() > 0 && weight.dim() == 4);
  TORCH_CHECK(output.is_contiguous());

  auto in_sizes = input.sizes();
  auto w_sizes = weight.sizes();

  // We assume that the input and weight Tensors are shaped properly by
  // the caller, so we verify that here to some extent

  // Weight Tensor is shape (output_channels, 1, kH, kW)
  TORCH_CHECK(w_sizes[1] == 1);

  // Input Tensor is shape (N, input_channels, H, W)
  // We verify that the # of output_channels is a multiple of input_channels
  TORCH_CHECK(w_sizes[0] % in_sizes[1] == 0);

  // Bias has same # of channels as output
  const bool has_bias = bias.defined();
  TORCH_CHECK(!has_bias || (bias.dim() <= 1 && bias.numel() == w_sizes[0]));

  // Following the behavior of other THCUNN functions, we shape the output
  // Tensor ourselves
  int64_t height = in_sizes[2];
  int64_t width = in_sizes[3];
  int64_t outputChannels = w_sizes[0];
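  // Per spatial dimension, conv_output_size applies the standard shape formula:
  //   out = (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1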
  auto out_sizes = conv_output_size(in_sizes, weight.sizes(), {padH, padW}, {dH, dW},
                                    {dilationH, dilationW});
  const auto outputWidth = out_sizes[3];
  const auto outputHeight = out_sizes[2];

  resize_output(output, out_sizes);

  int64_t inputChannels = in_sizes[1];
  int64_t depthwiseMultiplier = outputChannels / inputChannels;

  // One thread per output value
  TORCH_CHECK(canUse32BitIndexMath(input) && canUse32BitIndexMath(output));
  int32_t n = output.numel();
  int blocks = GET_BLOCKS(n);
  dim3 grid(blocks);
  dim3 block(CUDA_NUM_THREADS);
  const auto stream = c10::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
                                  "conv_depthwise2d_forward_cuda", [&] {
    // Create PackedTensorAccessors.
    // The kernel currently relies upon all the Tensors being contiguous; the caller has
    // already made them contiguous.
    const auto input_a = input.packed_accessor32<const scalar_t, 4>();
    const auto weight_a = weight.packed_accessor32<const scalar_t, 4>();
    const auto output_a = output.packed_accessor32<scalar_t, 4>();
    const auto bias_a = has_bias ?
      bias.packed_accessor32<const scalar_t, 1>() :
      dummy_packed_accessor32<const scalar_t, 1>();
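    // Dispatch a fully unrolled kernel for the common square filter sizes; anything else
    // goes to the generic kernel with precomputed tap bounds.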
    if (kW == 5 && kH == 5) {
      conv_depthwise2d_forward_kernel<5> <<<grid, block, 0, stream>>>(
        input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier,
        width, height, outputWidth, outputHeight,
        kW, kH, dW, dH, padW, padH, dilationW, dilationH);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    } else if (kW == 3 && kH == 3) {
      conv_depthwise2d_forward_kernel<3> <<<grid, block, 0, stream>>>(
        input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier,
        width, height, outputWidth, outputHeight,
        kW, kH, dW, dH, padW, padH, dilationW, dilationH);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    } else if (kW == 1 && kH == 1) {
      conv_depthwise2d_forward_kernel<1> <<<grid, block, 0, stream>>>(
        input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier,
        width, height, outputWidth, outputHeight,
        kW, kH, dW, dH, padW, padH, dilationW, dilationH);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    } else {
      conv_depthwise2d_forward_kernel_generic<<<grid, block, 0, stream>>>(
        input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier,
        width, height, outputWidth, outputHeight,
        kW, kH, dW, dH, padW, padH, dilationW, dilationH);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    }
  });
}

void conv_depthwise2d_backward_out(
                  const Tensor &input,
                  const Tensor &grad_output,
                  const Tensor &grad_input,
                  const Tensor &weight,
                  const int kW, const int kH,
                  const int dW, const int dH,
                  const int padW, const int padH,
                  const int dilationW, const int dilationH) {
  // Only handle 4D Input Tensors for now
  TORCH_CHECK(input.numel() > 0 && input.dim() == 4);
  TORCH_CHECK(weight.numel() > 0 && weight.dim() == 4);
  TORCH_CHECK(grad_output.numel() > 0 && grad_output.dim() == 4);

  // Minimal shape checking, as above
  // Same # of elements in batch
  TORCH_CHECK(input.sizes()[0] == grad_output.sizes()[0]);
  // Same # of filters as outputChannels
  TORCH_CHECK(weight.sizes()[0] == grad_output.sizes()[1]);

  // Resize grad_input
  auto in_sizes = input.sizes();
  resize_output(grad_input, in_sizes);

  int inputChannels = in_sizes[1];
  int height = in_sizes[2];
  int width = in_sizes[3];

  auto gO_sizes = grad_output.sizes();
  int outputChannels = gO_sizes[1];
  int outputHeight = gO_sizes[2];
  int outputWidth = gO_sizes[3];

  int depthwiseMultiplier = outputChannels / inputChannels;

  // Kernel currently relies upon all the Tensors to be contiguous
  TORCH_CHECK(grad_output.is_contiguous());
  TORCH_CHECK(weight.is_contiguous());
  TORCH_CHECK(grad_input.is_contiguous());

  // One thread per grad_input value
  TORCH_CHECK(canUse32BitIndexMath(grad_input) &&
              canUse32BitIndexMath(grad_output));
  int32_t n = grad_input.numel();
  int blocks = GET_BLOCKS(n);
  dim3 grid(blocks);
  dim3 block(CUDA_NUM_THREADS);
  const auto stream = c10::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, grad_output.scalar_type(),
                                  "conv_depthwise2d_backward_cuda", [&] {
    auto grad_output_a = grad_output.packed_accessor32<const scalar_t, 4>();
    auto grad_input_a = grad_input.packed_accessor32<scalar_t, 4>();
    auto weight_a = weight.packed_accessor32<const scalar_t, 4>();

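    // Specialize on (kernel size, stride) so the tap loops unroll and the stride
    // divisions/modulos operate on compile-time constants; <0, 0> is the fully generic fallback.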
    if (kW == 5 && kH == 5) {
      if (dW == 1 && dH == 1){
        conv_depthwise2d_backward_kernel<5, 1><<<grid, block, 0, stream>>>(
            grad_output_a, grad_input_a, weight_a, n, inputChannels, depthwiseMultiplier, outputChannels, width,
            height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else if (dW == 2 && dH == 2) {
        conv_depthwise2d_backward_kernel<5, 2><<<grid, block, 0, stream>>>(
            grad_output_a, grad_input_a, weight_a, n, inputChannels, depthwiseMultiplier, outputChannels, width,
            height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else {
        conv_depthwise2d_backward_kernel<5, 0><<<grid, block, 0, stream>>>(
            grad_output_a, grad_input_a, weight_a, n, inputChannels, depthwiseMultiplier, outputChannels, width,
            height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      }
    } else if (kW == 3 && kH == 3) {
      if (dW == 1 && dH == 1){
        conv_depthwise2d_backward_kernel<3, 1><<<grid, block, 0, stream>>>(
            grad_output_a, grad_input_a, weight_a, n, inputChannels, depthwiseMultiplier, outputChannels, width,
            height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else if (dW == 2 && dH == 2) {
        conv_depthwise2d_backward_kernel<3, 2><<<grid, block, 0, stream>>>(
            grad_output_a, grad_input_a, weight_a, n, inputChannels, depthwiseMultiplier, outputChannels, width,
            height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else {
        conv_depthwise2d_backward_kernel<3, 0><<<grid, block, 0, stream>>>(
            grad_output_a, grad_input_a, weight_a, n, inputChannels, depthwiseMultiplier, outputChannels, width,
            height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      }
    } else if (kW == 1 && kH == 1) {
      if (dW == 1 && dH == 1){
        conv_depthwise2d_backward_kernel<1, 1><<<grid, block, 0, stream>>>(
            grad_output_a, grad_input_a, weight_a, n, inputChannels, depthwiseMultiplier, outputChannels, width,
            height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else if (dW == 2 && dH == 2) {
        conv_depthwise2d_backward_kernel<1, 2><<<grid, block, 0, stream>>>(
            grad_output_a, grad_input_a, weight_a, n, inputChannels, depthwiseMultiplier, outputChannels, width,
            height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else {
        conv_depthwise2d_backward_kernel<1, 0><<<grid, block, 0, stream>>>(
            grad_output_a, grad_input_a, weight_a, n, inputChannels, depthwiseMultiplier, outputChannels, width,
            height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      }
    } else if (dW == 1 && dH == 1) {
      conv_depthwise2d_backward_kernel<0, 1><<<grid, block, 0, stream>>>(
          grad_output_a, grad_input_a, weight_a, n, inputChannels, depthwiseMultiplier, outputChannels, width,
          height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    } else if (dW == 2 && dH == 2) {
      conv_depthwise2d_backward_kernel<0, 2><<<grid, block, 0, stream>>>(
          grad_output_a, grad_input_a, weight_a, n, inputChannels, depthwiseMultiplier, outputChannels, width,
          height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    } else {
      conv_depthwise2d_backward_kernel<0, 0><<<grid, block, 0, stream>>>(
          grad_output_a, grad_input_a, weight_a, n, inputChannels, depthwiseMultiplier, outputChannels, width,
          height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    }
  });
}

// Crude benchmarks suggest 256 is better than 512 and 1024
// TODO: Autotune/use better heuristics, improve speed more.
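// e.g. with a 32-thread warp, batchSize = 4 gives 128 threads; batchSize >= 8 is capped at 256.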
int getGradParamsNumThreads(int batchSize) {
  // one warp per batch item, up to a maximum
  constexpr int MAX_BLOCK_SIZE = 256;
  return std::min(batchSize * at::cuda::warp_size(), MAX_BLOCK_SIZE);
}

void conv_depthwise2d_grad_weight_out(
                  const Tensor &input,
                  const Tensor &grad_output,
                  const Tensor &grad_weight,
                  const int kW, const int kH,
                  const int dW, const int dH,
                  const int padW, const int padH,
                  const int dilationW, const int dilationH) {
  // Only handle 4D Input Tensors for now
  TORCH_CHECK(input.numel() > 0 && input.dim() == 4);
  TORCH_CHECK(grad_output.numel() > 0 && grad_output.dim() == 4);

  // Minimal shape checking, as above
  // Same # of elements in batch
  TORCH_CHECK(input.sizes()[0] == grad_output.sizes()[0]);

  auto in_sizes = input.sizes();
  int batchSize = in_sizes[0];
  int inputChannels = in_sizes[1];
  int height = in_sizes[2];
  int width = in_sizes[3];

  auto gO_sizes = grad_output.sizes();
  int outputChannels = gO_sizes[1];
  int outputHeight = gO_sizes[2];
  int outputWidth = gO_sizes[3];

  int depthwiseMultiplier = outputChannels / inputChannels;

  resize_output(grad_weight, {outputChannels, 1, kH, kW});

  // Kernel currently relies upon all the Tensors to be contiguous
  TORCH_CHECK(grad_output.is_contiguous());
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(grad_weight.is_contiguous());

  // We parallelize so that each block computes a single value in grad_weight
  TORCH_CHECK(canUse32BitIndexMath(input) &&
              canUse32BitIndexMath(grad_output));
  int blocks = outputChannels * kH * kW;

  // Make sure we have enough threads to perform the reduction, and use this number
  // to create the shared memory size for the reduction
  dim3 grid(blocks);
  dim3 block(getGradParamsNumThreads(batchSize));
  const auto stream = c10::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, grad_output.scalar_type(),
                                  "conv_depthwise2d_grad_weight_cuda", [&] {
    const auto grad_output_a = grad_output.packed_accessor32<const scalar_t, 4>();
    const auto input_a = input.packed_accessor32<const scalar_t, 4>();
    const auto grad_weight_a = grad_weight.packed_accessor32<scalar_t, 4>();
    using acc_t = at::acc_type<scalar_t, true>;
    int warp_size = at::cuda::warp_size();
    TORCH_INTERNAL_ASSERT(block.x % warp_size == 0);
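    // One shared-memory accumulator slot per warp for the block reduction.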
    int smem = (block.x / warp_size) * sizeof(acc_t);
    conv_depthwise2d_grad_weight_kernel<<<grid, block, smem, stream>>>(
        grad_output_a, input_a, grad_weight_a, batchSize, inputChannels, outputChannels, depthwiseMultiplier,
        width, height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  });
}

}  // namespace (anonymous)

const Tensor& conv_depthwise2d_cuda_out(
    const Tensor &input_,
    const Tensor &weight_,
    IntArrayRef kernel_size,
    const std::optional<Tensor> &bias_opt,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    const Tensor &out) {
  TORCH_CHECK(kernel_size.size() == 2);
  TORCH_CHECK(stride.size() == 2);
  TORCH_CHECK(padding.size() == 2);
  TORCH_CHECK(dilation.size() == 2);

  auto input = input_.expect_contiguous();
  auto weight = weight_.expect_contiguous();
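  // When no bias is given, fall back to an owned, default-constructed (undefined) Tensor;
  // conv_depthwise2d_forward_out treats an undefined bias as "no bias".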
  auto bias = [&] {
    if (bias_opt.has_value() && bias_opt->defined()) {
      return bias_opt->expect_contiguous();
    }
    return c10::MaybeOwned<Tensor>::owned(std::in_place);
  }();

  conv_depthwise2d_forward_out(
      *input,
      out,
      *weight,
      *bias,
      kernel_size[1], kernel_size[0],
      stride[1], stride[0],
      padding[1], padding[0],
      dilation[1], dilation[0]);
  return out;
}

Tensor conv_depthwise2d_cuda(
    const Tensor &input,
    const Tensor &weight,
    IntArrayRef kernel_size,
    const std::optional<Tensor> &bias,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation) {
  auto out = at::empty({0}, input.options());
  return conv_depthwise2d_cuda_out(input, weight, kernel_size, bias,
                                   stride, padding, dilation, out);
}

std::tuple<Tensor&, Tensor&> conv_depthwise2d_backward_cuda_out(
    const Tensor & grad_output_,
    const Tensor & self_,
    const Tensor & weight_,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    Tensor & grad_input,
    Tensor & grad_weight) {
  auto grad_output = grad_output_.expect_contiguous();

  if (grad_weight.defined()) {
    auto self = self_.expect_contiguous();
    conv_depthwise2d_grad_weight_out(
        *self, *grad_output, grad_weight,
        kernel_size[1], kernel_size[0],
        stride[1], stride[0],
        padding[1], padding[0],
        dilation[1], dilation[0]);
  }

  if (grad_input.defined()) {
    auto weight = weight_.expect_contiguous();
    conv_depthwise2d_backward_out(
        self_, *grad_output, grad_input, *weight,
        kernel_size[1], kernel_size[0],
        stride[1], stride[0],
        padding[1], padding[0],
        dilation[1], dilation[0]);
  }
  return std::forward_as_tuple(grad_input, grad_weight);
}

std::tuple<Tensor, Tensor> conv_depthwise2d_backward_cuda(
    const Tensor& grad_output,
    const Tensor& self,
    const Tensor& weight,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    std::array<bool, 2> output_mask) {
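  // output_mask[0] / output_mask[1] select whether grad_input / grad_weight are computed;
  // tensors left undefined are skipped by the _out variant above.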
  Tensor grad_input;
  Tensor grad_weight;

  if (output_mask[0]) {
    grad_input = at::empty({0}, grad_output.options());
  }

  if (output_mask[1]) {
    grad_weight = at::empty({0}, grad_output.options());
  }
  return conv_depthwise2d_backward_cuda_out(
      grad_output,
      self,
      weight,
      kernel_size,
      stride,
      padding,
      dilation,
      grad_input,
      grad_weight);
}

REGISTER_CUDA_DISPATCH(conv_depthwise2d_backward_stub, &conv_depthwise2d_backward_cuda);

} // namespace at::native