From 982df9c7a0cf741221e39d4765e3553cad68f472 Mon Sep 17 00:00:00 2001
From: Nianxiang Fu <nianxiangfu@whu.edu.cn>
Date: Sat, 4 Jan 2025 17:28:09 +0800
Subject: [PATCH 1/3] Extend the GridSample operator to support Bicubic mode

---
 converter/main.py                     |   2 +-
 sadl/interpolation_utils.h            | 252 ++++++++++++++++++++++++++
 sadl/layer_gridsample.h               | 204 +++++++++++++++++++--
 utests/check.sh                       |   2 +-
 utests/models/gridsample_bicubic.onnx |  26 +++
 5 files changed, 470 insertions(+), 16 deletions(-)
 create mode 100644 utests/models/gridsample_bicubic.onnx

diff --git a/converter/main.py b/converter/main.py
index 09cf4ae..649535b 100644
--- a/converter/main.py
+++ b/converter/main.py
@@ -1245,7 +1245,7 @@ def parse_graph_node(
         mode = getAttribute(node, "mode").s.decode("utf-8")
         padding_mode = getAttribute(node, "padding_mode").s.decode("utf-8")
 
-        mode_list = ["nearest", "bilinear"]
+        mode_list = ["nearest", "bilinear", "bicubic"]
         if not mode in mode_list:
             quit("[ERROR] Currently, the mode of GridSample must in", mode_list, node)
         else:
diff --git a/sadl/interpolation_utils.h b/sadl/interpolation_utils.h
index 695f3df..fbb415e 100644
--- a/sadl/interpolation_utils.h
+++ b/sadl/interpolation_utils.h
@@ -62,6 +62,23 @@ namespace layers
   (void) coeffs
 #endif
 
+#if DEBUG_COUNTERS
+#define BICUBIC_COUNTERS(data, coeffs)  \
+  int C = data.dims()[3];               \
+  for (int im_c = 0; im_c < C; im_c++)  \
+  {                                     \
+    for (int i = 0; i < 16; i ++)       \
+    {                                   \
+      COUNTERS_MAC(coeffs[i]);          \
+    }                                   \
+    COUNTERS(0);                        \
+  }
+#else
+#define BICUBIC_COUNTERS(data, coeffs)  \
+  (void) data;                          \
+  (void) coeffs
+#endif
+
 // ////////////////////////////////////////////////////////////////////////////////////////////////////////
 // bilinear interpolation utils
 // ////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -321,5 +338,240 @@ void nearest_in_channels(const Tensor<T> &data, const int im_i, const int im_j,
     out(im_nb, im_i, im_j, im_c) = data(im_nb, y, x, im_c);   // same data type
   }
 }
+
+// ////////////////////////////////////////////////////////////////////////////////////////////////////////
+// bicubic interpolation utils
+// ////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<typename T, typename T2>
+void bicubic_in_channels_wo_simd(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out);
+
+#if __AVX2__
+template<typename T, typename T2>
+void bicubic_in_channels_simd256(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out);
+#endif
+
+#if __AVX512F__ || __AVX512BW__
+template<typename T, typename T2>
+void bicubic_in_channels_simd512(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out);
+#endif
+
+template<typename T, typename T2>
+void bicubic_in_channels(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out)
+{
+#if __AVX2__
+  int in_D = data.dims()[3];
+#if __AVX512F__
+  if (in_D % 16 == 0)   // same for float and int16
+    bicubic_in_channels_simd512(data, coeffs, pos, shift, im_i, im_j, out);
+  else if (in_D % 8 == 0)   // same for float and int16
+    bicubic_in_channels_simd256(data, coeffs, pos, shift, im_i, im_j, out);
+  else
+    bicubic_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out);
+#else
+  if (in_D % 8 == 0)
+    bicubic_in_channels_simd256(data, coeffs, pos, shift, im_i, im_j, out);
+  else
+    bicubic_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out);
+#endif
+#else
+  bicubic_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out);
+#endif
+}
+
+template<typename T, typename T2>
+void bicubic_in_channels_wo_simd(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out)
+{
+  constexpr int im_nb      = 0;
+  int           in_D       = data.dims()[3];
+
+  static std::vector<T2> temp_buffer;
+  temp_buffer.resize(in_D);
+  std::fill(temp_buffer.begin(), temp_buffer.end(), (T2) 0);
+
+  for (int coeff_i = 0; coeff_i < 16; coeff_i++)
+  {
+    const T2  coeff = coeffs[coeff_i];
+    const int pos_y = pos[coeff_i][0];
+    const int pos_x = pos[coeff_i][1];
+    for (int im_c = 0; im_c < in_D; im_c++)
+    {
+      temp_buffer[im_c] += coeff * data(im_nb, pos_y, pos_x, im_c);
+    }
+  }
+  for (int im_c = 0; im_c < in_D; im_c++)
+  {
+    T2 num = temp_buffer[im_c];
+    ComputationType<T>::quantize(num, shift);
+    SATURATE(num);
+    out(im_nb, im_i, im_j, im_c) = static_cast<T>(num);
+  }
+}
+
+#if __AVX2__
+template<>
+inline void bicubic_in_channels_simd256(const Tensor<float> &data, const float coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j,
+                                        Tensor<float> &out)
+{
+  constexpr int im_nb = 0;
+  int           in_D  = data.dims()[3];
+  assert(in_D % 8 == 0);   // Should be used with mod8 data.
+
+  static std::vector<float> temp_buffer;
+  temp_buffer.resize(in_D);
+  std::fill(temp_buffer.begin(), temp_buffer.end(), 0.f);
+
+  for (int coeff_i = 0; coeff_i < 16; coeff_i++)
+  {
+    const __m256 c0   = _mm256_set1_ps(coeffs[coeff_i]);
+    const float *dptr = data.addr(im_nb, pos[coeff_i][0], pos[coeff_i][1], 0);
+    const float *bptr = &temp_buffer[0];
+    for (int im_c = 0; im_c < in_D; im_c += 8)
+    {
+      const __m256 d0 = _mm256_loadu_ps(dptr + im_c);
+      const __m256 b0 = _mm256_loadu_ps(bptr + im_c);
+#if __FMA__
+      const __m256 s = _mm256_fmadd_ps(c0, d0, b0);
+#else
+      const __m256 s = _mm256_add_ps(_mm256_mul_ps(c0, d0), b0);
+#endif
+      _mm256_storeu_ps((float *) (bptr + im_c), s);
+    }
+  }
+  for (int im_c = 0; im_c < in_D; im_c++)
+  {
+    out(im_nb, im_i, im_j, im_c) = temp_buffer[im_c];
+  }
+}
+
+template<>
+inline void bicubic_in_channels_simd256(const Tensor<int16_t> &data, const int32_t coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j,
+                                        Tensor<int16_t> &out)
+{
+  constexpr int im_nb = 0;
+  int           in_D  = data.dims()[3];
+  assert(in_D % 8 == 0);   // Should be used with mod8 data.
+#if DEBUG_COUNTERS || SATURATE_RESULT
+  using T = int16_t;
+#endif
+
+  static std::vector<int32_t> temp_buffer;
+  temp_buffer.resize(in_D);
+  std::fill(temp_buffer.begin(), temp_buffer.end(), 0);
+
+  for (int coeff_i = 0; coeff_i < 16; coeff_i++)
+  {
+    const __m256i  c0   = _mm256_set1_epi32(coeffs[coeff_i]);
+    const int16_t *dptr = data.addr(im_nb, pos[coeff_i][0], pos[coeff_i][1], 0);
+    const int32_t *bptr = &temp_buffer[0];
+    for (int im_c = 0; im_c < in_D; im_c += 8)
+    {
+      const __m256i d0 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i *) (dptr + im_c)));
+      const __m256i b0 = _mm256_loadu_si256((__m256i *) (bptr + im_c));
+      const __m256i s  = _mm256_add_epi32(_mm256_mullo_epi32(c0, d0), b0);   // res in int32
+      _mm256_storeu_si256((__m256i *) (bptr + im_c), s);
+    }
+  }
+  for (int im_c = 0; im_c < in_D; im_c++)
+  {
+    int32_t num = temp_buffer[im_c];
+    ComputationType<int16_t>::quantize(num, shift);
+    SATURATE(num);
+    out(im_nb, im_i, im_j, im_c) = num;
+  }
+}
+
+template<typename T, typename T2>
+void bicubic_in_channels_simd256(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out)
+{
+  std::cerr << "TODO " << std::endl;
+  exit(-1);
+}
+#endif
+
+#if __AVX512F__
+template<>
+inline void bicubic_in_channels_simd512(const Tensor<float> &data, const float coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j,
+                                        Tensor<float> &out)
+{
+  constexpr int im_nb = 0;
+  int           in_D  = data.dims()[3];
+  assert(in_D % 16 == 0);   // Should be used with mod16 data.
+
+  static std::vector<float> temp_buffer;
+  temp_buffer.resize(in_D);
+  std::fill(temp_buffer.begin(), temp_buffer.end(), 0.f);
+
+  for (int coeff_i = 0; coeff_i < 16; coeff_i++)
+  {
+    const __m512 c0   = _mm512_set1_ps(coeffs[coeff_i]);
+    const float *dptr = data.addr(im_nb, pos[coeff_i][0], pos[coeff_i][1], 0);
+    const float *bptr = &temp_buffer[0];
+    for (int im_c = 0; im_c < in_D; im_c += 16)
+    {
+      const __m512 d0 = _mm512_loadu_ps(dptr + im_c);
+      const __m512 b0 = _mm512_loadu_ps(bptr + im_c);
+#if __FMA__
+      const __m512 s = _mm512_fmadd_ps(c0, d0, b0);
+#else
+      const __m512 s = _mm512_add_ps(_mm512_mul_ps(c0, d0), b0);
+#endif
+      _mm512_storeu_ps((float *) (bptr + im_c), s);
+    }
+  }
+  for (int im_c = 0; im_c < in_D; im_c++)
+  {
+    out(im_nb, im_i, im_j, im_c) = temp_buffer[im_c];
+  }
+}
+#endif
+
+#if __AVX512BW__
+template<>
+inline void bicubic_in_channels_simd512(const Tensor<int16_t> &data, const int32_t coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j,
+                                        Tensor<int16_t> &out)
+{
+  constexpr int im_nb = 0;
+  int           in_D  = data.dims()[3];
+  assert(in_D % 16 == 0);   // Should be used with mod16 data.
+#if DEBUG_COUNTERS || SATURATE_RESULT
+  using T = int16_t;
+#endif
+
+  static std::vector<int32_t> temp_buffer;
+  temp_buffer.resize(in_D);
+  std::fill(temp_buffer.begin(), temp_buffer.end(), 0);
+
+  for (int coeff_i = 0; coeff_i < 16; coeff_i++)
+  {
+    const __m512i  c0   = _mm512_set1_epi32(coeffs[coeff_i]);
+    const int16_t *dptr = data.addr(im_nb, pos[coeff_i][0], pos[coeff_i][1], 0);
+    const int32_t *bptr = &temp_buffer[0];
+    for (int im_c = 0; im_c < in_D; im_c += 16)
+    {
+      const __m512i d0 = _mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i *) (dptr + im_c)));
+      const __m512i b0 = _mm512_loadu_si512(bptr + im_c);
+      const __m512i s  = _mm512_add_epi32(_mm512_mullo_epi32(c0, d0), b0);   // res in int32
+      _mm512_storeu_si512((void *) (bptr + im_c), s);
+    }
+  }
+  for (int im_c = 0; im_c < in_D; im_c++)
+  {
+    int32_t num = temp_buffer[im_c];
+    ComputationType<int16_t>::quantize(num, shift);
+    SATURATE(num);
+    out(im_nb, im_i, im_j, im_c) = num;
+  }
+}
+#endif
+
+#if __AVX512BW__ || __AVX512F__
+template<typename T, typename T2>
+void bicubic_in_channels_simd512(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out)
+{
+  std::cerr << "TODO " << std::endl;
+  exit(-1);
+}
+#endif
+
 }   // namespace layers
 }   // namespace sadl
diff --git a/sadl/layer_gridsample.h b/sadl/layer_gridsample.h
index 4f368bb..2c03a8c 100644
--- a/sadl/layer_gridsample.h
+++ b/sadl/layer_gridsample.h
@@ -53,21 +53,26 @@ protected:
   virtual bool loadInternal(std::istream &file, Version) override;
   bool         gridsample_nearest(std::vector<Tensor<T> *> &in);
   bool         gridsample_bilinear(std::vector<Tensor<T> *> &in);
+  bool         gridsample_bicubic(std::vector<Tensor<T> *> &in);
   void         get_bilinear_coeffs(T2 y, T2 x, int pos[], T2 coeffs[]);
+  void         get_bicubic_coeffs(T2 y, T2 x, T2 coeffs[]);
+  void         calc_bicubic_positions(T2 y, T2 x, int H, int W, int pos[][2]);
+  T2           cubic_coeffs(T2 x, int i);
   void         gs_denormalize(T2 &x, int length);
   void         pixel_addr_at_grid(int H, int W, int &i, int &j);
 
   enum gridsample_mode
   {
     gridsample_mode_nearest  = 0,
-    gridsample_mode_bilinear = 1
+    gridsample_mode_bilinear = 1,
+    gridsample_mode_bicubic  = 2
   };
   enum gridsample_paddingmode
   {
     gridsample_paddingmode_border = 0
   };
   int m_align_corners;   // 0: False, 1: True
-  int m_mode;            // 0: "nearest", 1: "bilinear"
+  int m_mode;            // 0: "nearest", 1: "bilinear", 2: "bicubic"
   int m_padding_mode;    // 0: "border"
   int m_grid_quantizer;
   DUMP_MODEL_EXT;
@@ -85,12 +90,7 @@ template<typename T> bool GridSample<T>::loadInternal(std::istream &file, Versio
   file.read((char *) &x, sizeof(x));
   m_padding_mode = x;
   SADL_DBG(std::cout << "  - padding_mode: " << m_padding_mode << std::endl);
-  if (m_align_corners != 1)
-  {
-    std::cerr << "[ERROR] invalid align corners: " << m_align_corners << std::endl;
-    return false;
-  }
-  if (m_mode != 0 && m_mode != 1)
+  if (m_mode != 0 && m_mode != 1 && m_mode != 2)
   {
     std::cerr << "[ERROR] invalid mode: " << m_mode << std::endl;
     return false;
@@ -124,6 +124,8 @@ template<typename T> bool GridSample<T>::apply(std::vector<Tensor<T> *> &in)
     return gridsample_nearest(in);
   else if (m_mode == gridsample_mode_bilinear)
     return gridsample_bilinear(in);
+  else if (m_mode == gridsample_mode_bicubic)
+    return gridsample_bicubic(in);
 
   return false;
 }
@@ -210,16 +212,66 @@ template<typename T> bool GridSample<T>::gridsample_bilinear(std::vector<Tensor<
   return true;
 }
 
+template<typename T> bool GridSample<T>::gridsample_bicubic(std::vector<Tensor<T> *> &in)
+{
+  // Given an input data and a flow-field grid, computes the output Y using data
+  // values and pixel locations from the grid.
+  const Tensor<T> &data = *in[0];   // (1,H_in,W_in,C)
+  const Tensor<T> &grid = *in[1];   // (1,W_out,2,H_out)
+  m_out.quantizer       = data.quantizer;
+  m_grid_quantizer      = grid.quantizer;
+  assert(data.dims()[0] == 1 && grid.dims()[0] == 1 && grid.dims()[2] == 2);
+
+  constexpr int im_nb = 0;
+  int           shift = m_grid_quantizer + 1;
+  int           H_in  = data.dims()[1];
+  int           W_in  = data.dims()[2];
+  int           H_out = grid.dims()[3];
+  int           W_out = grid.dims()[1];
+
+  for (int im_j = 0; im_j < W_out; im_j++)
+  {
+    for (int im_i = 0; im_i < H_out; im_i++)
+    {
+      // compute pixel locations from the grid
+      T2 x = grid(im_nb, im_j, 0, im_i);
+      T2 y = grid(im_nb, im_j, 1, im_i);
+      gs_denormalize(x, W_in);
+      gs_denormalize(y, H_in);
+
+      // compute the output Y using data values and pixel locations
+      T2   coeffs[16];
+      int  pos[16][2];
+
+      // calculate original pixel position (x, y) and adjacent pixel positions.
+      calc_bicubic_positions(y, x, H_in, W_in, pos);
+
+      // coeffs
+      get_bicubic_coeffs(y, x, coeffs);
+
+      BICUBIC_COUNTERS(data, coeffs);
+      bicubic_in_channels(data, coeffs, pos, shift, im_i, im_j, m_out);
+    }
+  }
+  return true;
+}
+
 template<> inline void GridSample<float>::gs_denormalize(float &x, int length)
 {
   if (m_mode == gridsample_mode_nearest)
   {
-    x = (x + 1) * (length - 1) / 2.0f;
+    if (m_align_corners)
+      x = (x + 1) * (length - 1) / 2.0f;
+    else
+      x = ((x + 1) * length - 1) / 2.0f;
     x = round(x);
   }
-  else if (m_mode == gridsample_mode_bilinear)
+  else if (m_mode == gridsample_mode_bilinear || m_mode == gridsample_mode_bicubic)
   {
-    x = (x + 1) * (length - 1) / 2.0f;
+    if (m_align_corners)
+      x = (x + 1) * (length - 1) / 2.0f;
+    else
+      x = ((x + 1) * length - 1) / 2.0f;
   }
 }
 
@@ -229,13 +281,19 @@ template<typename T> void GridSample<T>::gs_denormalize(T2 &x, int length)
   if (m_mode == gridsample_mode_nearest)
   {
     T2 normalize_bias = (T)(1 << shift);
-    x                 = (x + normalize_bias) * (length - 1);
+    if (m_align_corners)
+      x               = (x + normalize_bias) * (length - 1);
+    else
+      x               = (x + normalize_bias) * length - (T)(1 << shift);
     x                 = (x + normalize_bias) >> (shift + 1);   // round
   }
-  else if (m_mode == gridsample_mode_bilinear)
+  else if (m_mode == gridsample_mode_bilinear || m_mode == gridsample_mode_bicubic)
   {
     T2 normalize_bias = (T)(1 << shift);
-    x                 = (x + normalize_bias) * (length - 1);   // shift later to to maintain precision in higher-order bits during computation
+    if (m_align_corners)
+      x               = (x + normalize_bias) * (length - 1);
+    else
+      x               = (x + normalize_bias) * length - (T)(1 << shift);  // shift later to to maintain precision in higher-order bits during computation
   }
 }
 
@@ -287,5 +345,123 @@ template<typename T> void GridSample<T>::pixel_addr_at_grid(int H, int W, int &i
   }
 }
 
+template<> void GridSample<float>::calc_bicubic_positions(float y, float x, int H, int W, int pos[][2])
+{
+  float x_int = std::floor(x);
+  float y_int = std::floor(y);
+
+  // acquire the positions of 16 adjacent pixels
+  int idx = 0;
+  for (int dy = -1; dy <= 2; dy++)
+  {
+    for (int dx = -1; dx <= 2; dx++)
+    {
+      pos[idx][0] = std::min(std::max((int)(y_int + dy), 0), H - 1);
+      pos[idx][1] = std::min(std::max((int)(x_int + dx), 0), W - 1);
+      idx++;
+    }
+  }
+}
+
+template<typename T> void GridSample<T>::calc_bicubic_positions(T2 y, T2 x, int H, int W, int pos[][2])
+{
+  int  shift = m_grid_quantizer + 1;
+  T2 x_int = x >> shift;   // floor
+  T2 y_int = y >> shift;
+
+  // acquire the positions of 16 adjacent pixels
+  int idx = 0;
+  for (int dy = -1; dy <= 2; dy++)
+  {
+    for (int dx = -1; dx <= 2; dx++)
+    {
+      pos[idx][0] = std::min(std::max((int)(y_int + dy), 0), H - 1);
+      pos[idx][1] = std::min(std::max((int)(x_int + dx), 0), W - 1);
+      idx++;
+    }
+  }
+}
+
+template<> inline float GridSample<float>::cubic_coeffs(float x, int i)
+{
+  if (i == -1)
+    return (-3 * x * x * x + 6 * x * x - 3 * x) / 4;
+  else if (i == 0)
+    return (5 * x * x * x - 9 * x * x + 4) / 4;
+  else if (i == 1)
+    return (-5 * x * x * x + 6 * x * x + 3 * x) / 4;
+  else
+    return (3 * x * x * x - 3 * x * x) / 4;
+}
+
+template<typename T> inline typename GridSample<T>::T2 GridSample<T>::cubic_coeffs(T2 x, int i)
+{
+  int shift = m_grid_quantizer + 1;
+  T2 x2 = x * x;
+  ComputationType<T>::quantize(x2, shift);
+  T2 x3 = x2 * x;
+  ComputationType<T>::quantize(x3, shift);
+  if (i == -1)
+    return (-3 * x3 + 6 * x2 - 3 * x) >> 2;
+  else if (i == 0)
+    return (5 * x3 - 9 * x2 + (4 << shift)) >> 2;
+  else if (i == 1)
+    return (-5 * x3 + 6 * x2 + 3 * x) >> 2;
+  else
+    return (3 * x3 - 3 * x2) >> 2;
+}
+
+template<> inline void GridSample<float>::get_bicubic_coeffs(float y_ori, float x_ori, float coeffs[])
+{
+  float x_ori_int = std::floor(x_ori);
+  float y_ori_int = std::floor(y_ori);
+  float x_ratio   = (x_ori == x_ori_int) ? 1 : x_ori - x_ori_int;
+  float y_ratio   = (y_ori == y_ori_int) ? 1 : y_ori - y_ori_int;
+
+  float wx[4], wy[4];
+  for (int i = -1; i <= 2; i++)
+  {
+    wx[i + 1] = cubic_coeffs(x_ratio, i);
+    wy[i + 1] = cubic_coeffs(y_ratio, i);
+  }
+
+  int idx = 0;
+  for (int j = 0; j < 4; j++)
+  {
+    for (int i = 0; i < 4; i++)
+    {
+      coeffs[idx++] = wx[i] * wy[j];
+    }
+  }
+}
+
+template<typename T> void GridSample<T>::get_bicubic_coeffs(T2 y_ori, T2 x_ori, T2 coeffs[])
+{
+  int shift = m_grid_quantizer + 1;
+
+  T2 x_ori_int = (x_ori >> shift) << shift;
+  T2 y_ori_int = (y_ori >> shift) << shift;
+  T2 x_ratio   = (x_ori == x_ori_int) ? (T)(1 << shift) : x_ori - x_ori_int;
+  T2 y_ratio   = (y_ori == y_ori_int) ? (T)(1 << shift) : y_ori - y_ori_int;
+
+  T2 wx[4], wy[4];
+  for (int i = -1; i <= 2; i++)
+  {
+    wx[i + 1] = cubic_coeffs(x_ratio, i);
+    wy[i + 1] = cubic_coeffs(y_ratio, i);
+  }
+
+  int idx = 0;
+  for (int j = 0; j < 4; j++)
+  {
+    for (int i = 0; i < 4; i++)
+    {
+      coeffs[idx] = wx[i] * wy[j] + (T)(1 << (shift - 1));
+      ComputationType<T>::quantize(coeffs[idx], shift);
+      idx++;
+    }
+  }
+}
+
 }   // namespace layers
 }   // namespace sadl
diff --git a/utests/check.sh b/utests/check.sh
index 2b8a011..fd96b99 100755
--- a/utests/check.sh
+++ b/utests/check.sh
@@ -37,7 +37,7 @@ L="conv2d_4_8x8x4_k1x1s1,1_g1_p0,0 conv2d_4_8x8x4_k1x1s1,1_g4_p0,0 conv2d_4_8x8x
   conv2d_5x1_1x5_s1_g16 conv2d_7x1_1x7_s1_g16 \
   conv2d_16_4x4x1_k1x3s1,1_g1_p0,0 conv2d_16_4x4x1_k3x1s1,1_g1_p0,0 conv2d_16_4x4x1_k3x3s1,1_g1_p0,0 \
   conv2d_16_4x4x16_k3x3s1,1_g16_p0,0 conv2d_16_4x4x16_k1x3s1,1_g16_p0,0 conv2d_16_4x4x16_k3x1s1,1_g16_p0,0 \
-  conv2d_1_4x4x16_k3x3s1,1_g1_p0,0"
+  conv2d_1_4x4x16_k3x3s1,1_g1_p0,0 gridsample_bicubic"
 
 
 # mini_model_bb_mult 
diff --git a/utests/models/gridsample_bicubic.onnx b/utests/models/gridsample_bicubic.onnx
new file mode 100644
index 0000000..29c73f2
--- /dev/null
+++ b/utests/models/gridsample_bicubic.onnx
@@ -0,0 +1,26 @@
+pytorch1.13.1:รง
+u
+input_tensor
+grid2/GridSample"
+GridSample*
+
align_corners *
+mode"bicubic *
+padding_mode"border 	torch_jitZ&
+input_tensor
+
+
+
+
+Z
+grid
+
+
+
+
+b
+2
+
+
+
+
+B
\ No newline at end of file
-- 
GitLab


From 86fedd14ad72776053bc0d4905b10135478d3926 Mon Sep 17 00:00:00 2001
From: Nianxiang Fu <nianxiangfu@whu.edu.cn>
Date: Wed, 15 Jan 2025 20:43:41 +0800
Subject: [PATCH 2/3] Fix AVX2 logic in bicubic_in_channels and
 bilinear_in_channels functions

---
 sadl/interpolation_utils.h | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/sadl/interpolation_utils.h b/sadl/interpolation_utils.h
index fbb415e..2a8e3aa 100644
--- a/sadl/interpolation_utils.h
+++ b/sadl/interpolation_utils.h
@@ -100,19 +100,15 @@ void bilinear_in_channels(const Tensor<T> &data, const T2 coeffs[], const int po
 {
 #if __AVX2__
   int in_D = data.dims()[3];
-#if __AVX512F__
+#if __AVX512F__ || __AVX512BW__
   if (in_D % 16 == 0)   // same for float and int16
     bilinear_in_channels_simd512(data, coeffs, pos, shift, im_i, im_j, out);
-  else if (in_D % 8 == 0)   // same for float and int16
-    bilinear_in_channels_simd256(data, coeffs, pos, shift, im_i, im_j, out);
   else
-    bilinear_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out);
-#else
-  if (in_D % 8 == 0)
+#endif
+  if (in_D % 8 == 0)    // same for float and int16
     bilinear_in_channels_simd256(data, coeffs, pos, shift, im_i, im_j, out);
   else
     bilinear_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out);
-#endif
 #else
   bilinear_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out);
 #endif
@@ -360,19 +356,15 @@ void bicubic_in_channels(const Tensor<T> &data, const T2 coeffs[], const int pos
 {
 #if __AVX2__
   int in_D = data.dims()[3];
-#if __AVX512F__
+#if __AVX512F__ || __AVX512BW__
   if (in_D % 16 == 0)   // same for float and int16
     bicubic_in_channels_simd512(data, coeffs, pos, shift, im_i, im_j, out);
-  else if (in_D % 8 == 0)   // same for float and int16
-    bicubic_in_channels_simd256(data, coeffs, pos, shift, im_i, im_j, out);
   else
-    bicubic_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out);
-#else
-  if (in_D % 8 == 0)
+#endif
+  if (in_D % 8 == 0)    // same for float and int16
     bicubic_in_channels_simd256(data, coeffs, pos, shift, im_i, im_j, out);
   else
     bicubic_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out);
-#endif
 #else
   bicubic_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out);
 #endif
-- 
GitLab


From 2c31f7ff1e3c91160d6b0e35b617d4430b437797 Mon Sep 17 00:00:00 2001
From: Nianxiang Fu <nianxiangfu@whu.edu.cn>
Date: Mon, 20 Jan 2025 21:04:35 +0800
Subject: [PATCH 3/3] Make get_bicubic_coeffs, calc_bicubic_positions, and
 cubic_coeffs const

---
 sadl/layer_gridsample.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/sadl/layer_gridsample.h b/sadl/layer_gridsample.h
index 2c03a8c..4847ba7 100644
--- a/sadl/layer_gridsample.h
+++ b/sadl/layer_gridsample.h
@@ -55,9 +55,9 @@ protected:
   bool         gridsample_bilinear(std::vector<Tensor<T> *> &in);
   bool         gridsample_bicubic(std::vector<Tensor<T> *> &in);
   void         get_bilinear_coeffs(T2 y, T2 x, int pos[], T2 coeffs[]);
-  void         get_bicubic_coeffs(T2 y, T2 x, T2 coeffs[]);
-  void         calc_bicubic_positions(T2 y, T2 x, int H, int W, int pos[][2]);
-  T2           cubic_coeffs(T2 x, int i);
+  void         get_bicubic_coeffs(T2 y, T2 x, T2 coeffs[]) const;
+  void         calc_bicubic_positions(T2 y, T2 x, int H, int W, int pos[][2]) const;
+  T2           cubic_coeffs(T2 x, int i) const;
   void         gs_denormalize(T2 &x, int length);
   void         pixel_addr_at_grid(int H, int W, int &i, int &j);
 
@@ -345,7 +345,7 @@ template<typename T> void GridSample<T>::pixel_addr_at_grid(int H, int W, int &i
   }
 }
 
-template<> void GridSample<float>::calc_bicubic_positions(float y, float x, int H, int W, int pos[][2])
+template<> void GridSample<float>::calc_bicubic_positions(float y, float x, int H, int W, int pos[][2]) const
 {
   float x_int = std::floor(x);
   float y_int = std::floor(y);
@@ -363,7 +363,7 @@ template<> void GridSample<float>::calc_bicubic_positions(float y, float x, int
   }
 }
 
-template<typename T> void GridSample<T>::calc_bicubic_positions(T2 y, T2 x, int H, int W, int pos[][2])
+template<typename T> void GridSample<T>::calc_bicubic_positions(T2 y, T2 x, int H, int W, int pos[][2]) const
 {
   int  shift = m_grid_quantizer + 1;
   T2 x_int = x >> shift;   // floor
@@ -382,7 +382,7 @@ template<typename T> void GridSample<T>::calc_bicubic_positions(T2 y, T2 x, int
   }
 }
 
-template<> inline float GridSample<float>::cubic_coeffs(float x, int i)
+template<> inline float GridSample<float>::cubic_coeffs(float x, int i) const
 {
   if (i == -1)
     return (-3 * x * x * x + 6 * x * x - 3 * x) / 4;
@@ -394,7 +394,7 @@ template<> inline float GridSample<float>::cubic_coeffs(float x, int i)
     return (3 * x * x * x - 3 * x * x) / 4;
 }
 
-template<typename T> inline typename GridSample<T>::T2 GridSample<T>::cubic_coeffs(T2 x, int i)
+template<typename T> inline typename GridSample<T>::T2 GridSample<T>::cubic_coeffs(T2 x, int i) const
 {
   int shift = m_grid_quantizer + 1;
   T2 x2 = x * x;
@@ -411,7 +411,7 @@ template<typename T> inline typename GridSample<T>::T2 GridSample<T>::cubic_coef
     return (3 * x3 - 3 * x2) >> 2;
 }
 
-template<> inline void GridSample<float>::get_bicubic_coeffs(float y_ori, float x_ori, float coeffs[])
+template<> inline void GridSample<float>::get_bicubic_coeffs(float y_ori, float x_ori, float coeffs[]) const
 {
   float x_ori_int = std::floor(x_ori);
   float y_ori_int = std::floor(y_ori);
@@ -435,7 +435,7 @@ template<> inline void GridSample<float>::get_bicubic_coeffs(float y_ori, float
   }
 }
 
-template<typename T> void GridSample<T>::get_bicubic_coeffs(T2 y_ori, T2 x_ori, T2 coeffs[])
+template<typename T> void GridSample<T>::get_bicubic_coeffs(T2 y_ori, T2 x_ori, T2 coeffs[]) const
 {
   int shift = m_grid_quantizer + 1;
 
-- 
GitLab