From 982df9c7a0cf741221e39d4765e3553cad68f472 Mon Sep 17 00:00:00 2001 From: Nianxiang Fu <nianxiangfu@whu.edu.cn> Date: Sat, 4 Jan 2025 17:28:09 +0800 Subject: [PATCH 1/3] Extend the GridSample operator to support Bicubic mode --- converter/main.py | 2 +- sadl/interpolation_utils.h | 252 ++++++++++++++++++++++++++ sadl/layer_gridsample.h | 204 +++++++++++++++++++-- utests/check.sh | 2 +- utests/models/gridsample_bicubic.onnx | 26 +++ 5 files changed, 470 insertions(+), 16 deletions(-) create mode 100644 utests/models/gridsample_bicubic.onnx diff --git a/converter/main.py b/converter/main.py index 09cf4ae..649535b 100644 --- a/converter/main.py +++ b/converter/main.py @@ -1245,7 +1245,7 @@ def parse_graph_node( mode = getAttribute(node, "mode").s.decode("utf-8") padding_mode = getAttribute(node, "padding_mode").s.decode("utf-8") - mode_list = ["nearest", "bilinear"] + mode_list = ["nearest", "bilinear", "bicubic"] if not mode in mode_list: quit("[ERROR] Currently, the mode of GridSample must in", mode_list, node) else: diff --git a/sadl/interpolation_utils.h b/sadl/interpolation_utils.h index 695f3df..fbb415e 100644 --- a/sadl/interpolation_utils.h +++ b/sadl/interpolation_utils.h @@ -62,6 +62,23 @@ namespace layers (void) coeffs #endif +#if DEBUG_COUNTERS +#define BICUBIC_COUNTERS(data, coeffs) \ + int C = data.dims()[3]; \ + for (int im_c = 0; im_c < C; im_c++) \ + { \ + for (int i = 0; i < 16; i ++) \ + { \ + COUNTERS_MAC(coeffs[i]); \ + } \ + COUNTERS(0); \ + } +#else +#define BICUBIC_COUNTERS(data, coeffs) \ + (void) data; \ + (void) coeffs +#endif + // //////////////////////////////////////////////////////////////////////////////////////////////////////// // bilinear interpolation utils // //////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -321,5 +338,240 @@ void nearest_in_channels(const Tensor<T> &data, const int im_i, const int im_j, out(im_nb, im_i, im_j, im_c) = data(im_nb, y, x, im_c); // same data type } } + +// //////////////////////////////////////////////////////////////////////////////////////////////////////// +// bicubic interpolation utils +// //////////////////////////////////////////////////////////////////////////////////////////////////////// +template<typename T, typename T2> +void bicubic_in_channels_wo_simd(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out); + +#if __AVX2__ +template<typename T, typename T2> +void bicubic_in_channels_simd256(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out); +#endif + +#if __AVX512F__ || __AVX512BW__ +template<typename T, typename T2> +void bicubic_in_channels_simd512(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out); +#endif + +template<typename T, typename T2> +void bicubic_in_channels(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out) +{ +#if __AVX2__ + int in_D = data.dims()[3]; +#if __AVX512F__ + if (in_D % 16 == 0) // same for float and int16 + bicubic_in_channels_simd512(data, coeffs, pos, shift, im_i, im_j, out); + else if (in_D % 8 == 0) // same for float and int16 + bicubic_in_channels_simd256(data, coeffs, pos, shift, im_i, im_j, out); + else + bicubic_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out); +#else + if (in_D % 8 == 0) + bicubic_in_channels_simd256(data, coeffs, pos, shift, im_i, im_j, out); + else + bicubic_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out); +#endif +#else + bicubic_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out); +#endif +} + +template<typename T, typename T2> +void bicubic_in_channels_wo_simd(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out) +{ + constexpr int im_nb = 0; + int in_D = data.dims()[3]; + + static std::vector<T2> temp_buffer; + temp_buffer.resize(in_D); + std::fill(temp_buffer.begin(), temp_buffer.end(), (T2) 0); + + for (int coeff_i = 0; coeff_i < 16; coeff_i++) + { + const T2 coeff = coeffs[coeff_i]; + const int pos_y = pos[coeff_i][0]; + const int pos_x = pos[coeff_i][1]; + for (int im_c = 0; im_c < in_D; im_c++) + { + temp_buffer[im_c] += coeff * data(im_nb, pos_y, pos_x, im_c); + } + } + for (int im_c = 0; im_c < in_D; im_c++) + { + T2 num = temp_buffer[im_c]; + ComputationType<T>::quantize(num, shift); + SATURATE(num); + out(im_nb, im_i, im_j, im_c) = static_cast<T>(num); + } +} + +#if __AVX2__ +template<> +inline void bicubic_in_channels_simd256(const Tensor<float> &data, const float coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, + Tensor<float> &out) +{ + constexpr int im_nb = 0; + int in_D = data.dims()[3]; + assert(in_D % 8 == 0); // Should be used with mod8 data. + + static std::vector<float> temp_buffer; + temp_buffer.resize(in_D); + std::fill(temp_buffer.begin(), temp_buffer.end(), 0.f); + + for (int coeff_i = 0; coeff_i < 16; coeff_i++) + { + const __m256 c0 = _mm256_set1_ps(coeffs[coeff_i]); + const float *dptr = data.addr(im_nb, pos[coeff_i][0], pos[coeff_i][1], 0); + const float *bptr = &temp_buffer[0]; + for (int im_c = 0; im_c < in_D; im_c += 8) + { + const __m256 d0 = _mm256_loadu_ps(dptr + im_c); + const __m256 b0 = _mm256_loadu_ps(bptr + im_c); +#if __FMA__ + const __m256 s = _mm256_fmadd_ps(c0, d0, b0); +#else + const __m256 s = _mm256_add_ps(_mm256_mul_ps(c0, d0), b0); +#endif + _mm256_storeu_ps((float *) (bptr + im_c), s); + } + } + for (int im_c = 0; im_c < in_D; im_c++) + { + out(im_nb, im_i, im_j, im_c) = temp_buffer[im_c]; + } +} + +template<> +inline void bicubic_in_channels_simd256(const Tensor<int16_t> &data, const int32_t coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, + Tensor<int16_t> &out) +{ + constexpr int im_nb = 0; + int in_D = data.dims()[3]; + assert(in_D % 8 == 0); // Should be used with mod8 data. +#if DEBUG_COUNTERS || SATURATE_RESULT + using T = int16_t; +#endif + + static std::vector<int32_t> temp_buffer; + temp_buffer.resize(in_D); + std::fill(temp_buffer.begin(), temp_buffer.end(), 0); + + for (int coeff_i = 0; coeff_i < 16; coeff_i++) + { + const __m256i c0 = _mm256_set1_epi32(coeffs[coeff_i]); + const int16_t *dptr = data.addr(im_nb, pos[coeff_i][0], pos[coeff_i][1], 0); + const int32_t *bptr = &temp_buffer[0]; + for (int im_c = 0; im_c < in_D; im_c += 8) + { + const __m256i d0 = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i *) (dptr + im_c))); + const __m256i b0 = _mm256_loadu_si256((__m256i *) (bptr + im_c)); + const __m256i s = _mm256_add_epi32(_mm256_mullo_epi32(c0, d0), b0); // res in int32 + _mm256_storeu_si256((__m256i *) (bptr + im_c), s); + } + } + for (int im_c = 0; im_c < in_D; im_c++) + { + int32_t num = temp_buffer[im_c]; + ComputationType<int16_t>::quantize(num, shift); + SATURATE(num); + out(im_nb, im_i, im_j, im_c) = num; + } +} + +template<typename T, typename T2> +void bicubic_in_channels_simd256(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out) +{ + std::cerr << "TODO " << std::endl; + exit(-1); +} +#endif + +#if __AVX512F__ +template<> +inline void bicubic_in_channels_simd512(const Tensor<float> &data, const float coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, + Tensor<float> &out) +{ + constexpr int im_nb = 0; + int in_D = data.dims()[3]; + assert(in_D % 16 == 0); // Should be used with mod16 data. + + static std::vector<float> temp_buffer; + temp_buffer.resize(in_D); + std::fill(temp_buffer.begin(), temp_buffer.end(), 0.f); + + for (int coeff_i = 0; coeff_i < 16; coeff_i++) + { + const __m512 c0 = _mm512_set1_ps(coeffs[coeff_i]); + const float *dptr = data.addr(im_nb, pos[coeff_i][0], pos[coeff_i][1], 0); + const float *bptr = &temp_buffer[0]; + for (int im_c = 0; im_c < in_D; im_c += 16) + { + const __m512 d0 = _mm512_loadu_ps(dptr + im_c); + const __m512 b0 = _mm512_loadu_ps(bptr + im_c); +#if __FMA__ + const __m512 s = _mm512_fmadd_ps(c0, d0, b0); +#else + const __m512 s = _mm512_add_ps(_mm512_mul_ps(c0, d0), b0); +#endif + _mm512_storeu_ps((float *) (bptr + im_c), s); + } + } + for (int im_c = 0; im_c < in_D; im_c++) + { + out(im_nb, im_i, im_j, im_c) = temp_buffer[im_c]; + } +} +#endif + +#if __AVX512BW__ +template<> +inline void bicubic_in_channels_simd512(const Tensor<int16_t> &data, const int32_t coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, + Tensor<int16_t> &out) +{ + constexpr int im_nb = 0; + int in_D = data.dims()[3]; + assert(in_D % 16 == 0); // Should be used with mod16 data. +#if DEBUG_COUNTERS || SATURATE_RESULT + using T = int16_t; +#endif + + static std::vector<int32_t> temp_buffer; + temp_buffer.resize(in_D); + std::fill(temp_buffer.begin(), temp_buffer.end(), 0); + + for (int coeff_i = 0; coeff_i < 16; coeff_i++) + { + const __m512i c0 = _mm512_set1_epi32(coeffs[coeff_i]); + const int16_t *dptr = data.addr(im_nb, pos[coeff_i][0], pos[coeff_i][1], 0); + const int32_t *bptr = &temp_buffer[0]; + for (int im_c = 0; im_c < in_D; im_c += 16) + { + const __m512i d0 = _mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i *) (dptr + im_c))); + const __m512i b0 = _mm512_loadu_si512(bptr + im_c); + const __m512i s = _mm512_add_epi32(_mm512_mullo_epi32(c0, d0), b0); // res in int32 + _mm512_storeu_si512((void *) (bptr + im_c), s); + } + } + for (int im_c = 0; im_c < in_D; im_c++) + { + int32_t num = temp_buffer[im_c]; + ComputationType<int16_t>::quantize(num, shift); + SATURATE(num); + out(im_nb, im_i, im_j, im_c) = num; + } +} +#endif + +#if __AVX512BW__ || __AVX512F__ +template<typename T, typename T2> +void bicubic_in_channels_simd512(const Tensor<T> &data, const T2 coeffs[], const int pos[][2], const int shift, const int im_i, const int im_j, Tensor<T> &out) +{ + std::cerr << "TODO " << std::endl; + exit(-1); +} +#endif + } // namespace layers } // namespace sadl diff --git a/sadl/layer_gridsample.h b/sadl/layer_gridsample.h index 4f368bb..2c03a8c 100644 --- a/sadl/layer_gridsample.h +++ b/sadl/layer_gridsample.h @@ -53,21 +53,26 @@ protected: virtual bool loadInternal(std::istream &file, Version) override; bool gridsample_nearest(std::vector<Tensor<T> *> &in); bool gridsample_bilinear(std::vector<Tensor<T> *> &in); + bool gridsample_bicubic(std::vector<Tensor<T> *> &in); void get_bilinear_coeffs(T2 y, T2 x, int pos[], T2 coeffs[]); + void get_bicubic_coeffs(T2 y, T2 x, T2 coeffs[]); + void calc_bicubic_positions(T2 y, T2 x, int H, int W, int pos[][2]); + T2 cubic_coeffs(T2 x, int i); void gs_denormalize(T2 &x, int length); void pixel_addr_at_grid(int H, int W, int &i, int &j); enum gridsample_mode { gridsample_mode_nearest = 0, - gridsample_mode_bilinear = 1 + gridsample_mode_bilinear = 1, + gridsample_mode_bicubic = 2 }; enum gridsample_paddingmode { gridsample_paddingmode_border = 0 }; int m_align_corners; // 0: False, 1: True - int m_mode; // 0: "nearest", 1: "bilinear" + int m_mode; // 0: "nearest", 1: "bilinear", 2: "bicubic" int m_padding_mode; // 0: "border" int m_grid_quantizer; DUMP_MODEL_EXT; @@ -85,12 +90,7 @@ template<typename T> bool GridSample<T>::loadInternal(std::istream &file, Versio file.read((char *) &x, sizeof(x)); m_padding_mode = x; SADL_DBG(std::cout << " - padding_mode: " << m_padding_mode << std::endl); - if (m_align_corners != 1) - { - std::cerr << "[ERROR] invalid align corners: " << m_align_corners << std::endl; - return false; - } - if (m_mode != 0 && m_mode != 1) + if (m_mode != 0 && m_mode != 1 && m_mode != 2) { std::cerr << "[ERROR] invalid mode: " << m_mode << std::endl; return false; @@ -124,6 +124,8 @@ template<typename T> bool GridSample<T>::apply(std::vector<Tensor<T> *> &in) return gridsample_nearest(in); else if (m_mode == gridsample_mode_bilinear) return gridsample_bilinear(in); + else if (m_mode == gridsample_mode_bicubic) + return gridsample_bicubic(in); return false; } @@ -210,16 +212,66 @@ template<typename T> bool GridSample<T>::gridsample_bilinear(std::vector<Tensor< return true; } +template<typename T> bool GridSample<T>::gridsample_bicubic(std::vector<Tensor<T> *> &in) +{ + // Given an input data and a flow-field grid, computes the output Y using data + // values and pixel locations from the grid. + const Tensor<T> &data = *in[0]; // (1,H_in,W_in,C) + const Tensor<T> &grid = *in[1]; // (1,W_out,2,H_out) + m_out.quantizer = data.quantizer; + m_grid_quantizer = grid.quantizer; + assert(data.dims()[0] == 1 && grid.dims()[0] == 1 && grid.dims()[2] == 2); + + constexpr int im_nb = 0; + int shift = m_grid_quantizer + 1; + int H_in = data.dims()[1]; + int W_in = data.dims()[2]; + int H_out = grid.dims()[3]; + int W_out = grid.dims()[1]; + + for (int im_j = 0; im_j < W_out; im_j++) + { + for (int im_i = 0; im_i < H_out; im_i++) + { + // compute pixel locations from the grid + T2 x = grid(im_nb, im_j, 0, im_i); + T2 y = grid(im_nb, im_j, 1, im_i); + gs_denormalize(x, W_in); + gs_denormalize(y, H_in); + + // compute the output Y using data values and pixel locations + T2 coeffs[16]; + int pos[16][2]; + + // calculate original pixel position (x, y) and adjacent pixel positions. + calc_bicubic_positions(y, x, H_in, W_in, pos); + + // coeffs + get_bicubic_coeffs(y, x, coeffs); + + BICUBIC_COUNTERS(data, coeffs); + bicubic_in_channels(data, coeffs, pos, shift, im_i, im_j, m_out); + } + } + return true; +} + template<> inline void GridSample<float>::gs_denormalize(float &x, int length) { if (m_mode == gridsample_mode_nearest) { - x = (x + 1) * (length - 1) / 2.0f; + if (m_align_corners) + x = (x + 1) * (length - 1) / 2.0f; + else + x = ((x + 1) * length - 1) / 2.0f; x = round(x); } - else if (m_mode == gridsample_mode_bilinear) + else if (m_mode == gridsample_mode_bilinear || m_mode == gridsample_mode_bicubic) { - x = (x + 1) * (length - 1) / 2.0f; + if (m_align_corners) + x = (x + 1) * (length - 1) / 2.0f; + else + x = ((x + 1) * length - 1) / 2.0f; } } @@ -229,13 +281,19 @@ template<typename T> void GridSample<T>::gs_denormalize(T2 &x, int length) if (m_mode == gridsample_mode_nearest) { T2 normalize_bias = (T)(1 << shift); - x = (x + normalize_bias) * (length - 1); + if (m_align_corners) + x = (x + normalize_bias) * (length - 1); + else + x = (x + normalize_bias) * length - (T)(1 << shift); x = (x + normalize_bias) >> (shift + 1); // round } - else if (m_mode == gridsample_mode_bilinear) + else if (m_mode == gridsample_mode_bilinear || m_mode == gridsample_mode_bicubic) { T2 normalize_bias = (T)(1 << shift); - x = (x + normalize_bias) * (length - 1); // shift later to to maintain precision in higher-order bits during computation + if (m_align_corners) + x = (x + normalize_bias) * (length - 1); + else + x = (x + normalize_bias) * length - (T)(1 << shift); // shift later to to maintain precision in higher-order bits during computation } } @@ -287,5 +345,123 @@ template<typename T> void GridSample<T>::pixel_addr_at_grid(int H, int W, int &i } } +template<> void GridSample<float>::calc_bicubic_positions(float y, float x, int H, int W, int pos[][2]) +{ + float x_int = std::floor(x); + float y_int = std::floor(y); + + // acquire the positions of 16 adjacent pixels + int idx = 0; + for (int dy = -1; dy <= 2; dy++) + { + for (int dx = -1; dx <= 2; dx++) + { + pos[idx][0] = std::min(std::max((int)(y_int + dy), 0), H - 1); + pos[idx][1] = std::min(std::max((int)(x_int + dx), 0), W - 1); + idx++; + } + } +} + +template<typename T> void GridSample<T>::calc_bicubic_positions(T2 y, T2 x, int H, int W, int pos[][2]) +{ + int shift = m_grid_quantizer + 1; + T2 x_int = x >> shift; // floor + T2 y_int = y >> shift; + + // acquire the positions of 16 adjacent pixels + int idx = 0; + for (int dy = -1; dy <= 2; dy++) + { + for (int dx = -1; dx <= 2; dx++) + { + pos[idx][0] = std::min(std::max((int)(y_int + dy), 0), H - 1); + pos[idx][1] = std::min(std::max((int)(x_int + dx), 0), W - 1); + idx++; + } + } +} + +template<> inline float GridSample<float>::cubic_coeffs(float x, int i) +{ + if (i == -1) + return (-3 * x * x * x + 6 * x * x - 3 * x) / 4; + else if (i == 0) + return (5 * x * x * x - 9 * x * x + 4) / 4; + else if (i == 1) + return (-5 * x * x * x + 6 * x * x + 3 * x) / 4; + else + return (3 * x * x * x - 3 * x * x) / 4; +} + +template<typename T> inline typename GridSample<T>::T2 GridSample<T>::cubic_coeffs(T2 x, int i) +{ + int shift = m_grid_quantizer + 1; + T2 x2 = x * x; + ComputationType<T>::quantize(x2, shift); + T2 x3 = x2 * x; + ComputationType<T>::quantize(x3, shift); + if (i == -1) + return (-3 * x3 + 6 * x2 - 3 * x) >> 2; + else if (i == 0) + return (5 * x3 - 9 * x2 + (4 << shift)) >> 2; + else if (i == 1) + return (-5 * x3 + 6 * x2 + 3 * x) >> 2; + else + return (3 * x3 - 3 * x2) >> 2; +} + +template<> inline void GridSample<float>::get_bicubic_coeffs(float y_ori, float x_ori, float coeffs[]) +{ + float x_ori_int = std::floor(x_ori); + float y_ori_int = std::floor(y_ori); + float x_ratio = (x_ori == x_ori_int) ? 1 : x_ori - x_ori_int; + float y_ratio = (y_ori == y_ori_int) ? 1 : y_ori - y_ori_int; + + float wx[4], wy[4]; + for (int i = -1; i <= 2; i++) + { + wx[i + 1] = cubic_coeffs(x_ratio, i); + wy[i + 1] = cubic_coeffs(y_ratio, i); + } + + int idx = 0; + for (int j = 0; j < 4; j++) + { + for (int i = 0; i < 4; i++) + { + coeffs[idx++] = wx[i] * wy[j]; + } + } +} + +template<typename T> void GridSample<T>::get_bicubic_coeffs(T2 y_ori, T2 x_ori, T2 coeffs[]) +{ + int shift = m_grid_quantizer + 1; + + T2 x_ori_int = (x_ori >> shift) << shift; + T2 y_ori_int = (y_ori >> shift) << shift; + T2 x_ratio = (x_ori == x_ori_int) ? (T)(1 << shift) : x_ori - x_ori_int; + T2 y_ratio = (y_ori == y_ori_int) ? (T)(1 << shift) : y_ori - y_ori_int; + + T2 wx[4], wy[4]; + for (int i = -1; i <= 2; i++) + { + wx[i + 1] = cubic_coeffs(x_ratio, i); + wy[i + 1] = cubic_coeffs(y_ratio, i); + } + + int idx = 0; + for (int j = 0; j < 4; j++) + { + for (int i = 0; i < 4; i++) + { + coeffs[idx] = wx[i] * wy[j] + (T)(1 << (shift - 1)); + ComputationType<T>::quantize(coeffs[idx], shift); + idx++; + } + } +} + } // namespace layers } // namespace sadl diff --git a/utests/check.sh b/utests/check.sh index 2b8a011..fd96b99 100755 --- a/utests/check.sh +++ b/utests/check.sh @@ -37,7 +37,7 @@ L="conv2d_4_8x8x4_k1x1s1,1_g1_p0,0 conv2d_4_8x8x4_k1x1s1,1_g4_p0,0 conv2d_4_8x8x conv2d_5x1_1x5_s1_g16 conv2d_7x1_1x7_s1_g16 \ conv2d_16_4x4x1_k1x3s1,1_g1_p0,0 conv2d_16_4x4x1_k3x1s1,1_g1_p0,0 conv2d_16_4x4x1_k3x3s1,1_g1_p0,0 \ conv2d_16_4x4x16_k3x3s1,1_g16_p0,0 conv2d_16_4x4x16_k1x3s1,1_g16_p0,0 conv2d_16_4x4x16_k3x1s1,1_g16_p0,0 \ - conv2d_1_4x4x16_k3x3s1,1_g1_p0,0" + conv2d_1_4x4x16_k3x3s1,1_g1_p0,0 gridsample_bicubic" # mini_model_bb_mult diff --git a/utests/models/gridsample_bicubic.onnx b/utests/models/gridsample_bicubic.onnx new file mode 100644 index 0000000..29c73f2 --- /dev/null +++ b/utests/models/gridsample_bicubic.onnx @@ -0,0 +1,26 @@ +pytorch1.13.1:รง +u +input_tensor +grid2/GridSample" +GridSample* + align_corners * +mode"bicubic * +padding_mode"border torch_jitZ& +input_tensor + + + + +Z +grid + + + + +b +2 + + + + +B \ No newline at end of file -- GitLab From 86fedd14ad72776053bc0d4905b10135478d3926 Mon Sep 17 00:00:00 2001 From: Nianxiang Fu <nianxiangfu@whu.edu.cn> Date: Wed, 15 Jan 2025 20:43:41 +0800 Subject: [PATCH 2/3] Fix AVX2 logic in bicubic_in_channels and bilinear_in_channels functions --- sadl/interpolation_utils.h | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/sadl/interpolation_utils.h b/sadl/interpolation_utils.h index fbb415e..2a8e3aa 100644 --- a/sadl/interpolation_utils.h +++ b/sadl/interpolation_utils.h @@ -100,19 +100,15 @@ void bilinear_in_channels(const Tensor<T> &data, const T2 coeffs[], const int po { #if __AVX2__ int in_D = data.dims()[3]; -#if __AVX512F__ +#if __AVX512F__ || __AVX512BW__ if (in_D % 16 == 0) // same for float and int16 bilinear_in_channels_simd512(data, coeffs, pos, shift, im_i, im_j, out); - else if (in_D % 8 == 0) // same for float and int16 - bilinear_in_channels_simd256(data, coeffs, pos, shift, im_i, im_j, out); else - bilinear_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out); -#else - if (in_D % 8 == 0) +#endif + if (in_D % 8 == 0) // same for float and int16 bilinear_in_channels_simd256(data, coeffs, pos, shift, im_i, im_j, out); else bilinear_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out); -#endif #else bilinear_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out); #endif @@ -360,19 +356,15 @@ void bicubic_in_channels(const Tensor<T> &data, const T2 coeffs[], const int pos { #if __AVX2__ int in_D = data.dims()[3]; -#if __AVX512F__ +#if __AVX512F__ || __AVX512BW__ if (in_D % 16 == 0) // same for float and int16 bicubic_in_channels_simd512(data, coeffs, pos, shift, im_i, im_j, out); - else if (in_D % 8 == 0) // same for float and int16 - bicubic_in_channels_simd256(data, coeffs, pos, shift, im_i, im_j, out); else - bicubic_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out); -#else - if (in_D % 8 == 0) +#endif + if (in_D % 8 == 0) // same for float and int16 bicubic_in_channels_simd256(data, coeffs, pos, shift, im_i, im_j, out); else bicubic_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out); -#endif #else bicubic_in_channels_wo_simd(data, coeffs, pos, shift, im_i, im_j, out); #endif -- GitLab From 2c31f7ff1e3c91160d6b0e35b617d4430b437797 Mon Sep 17 00:00:00 2001 From: Nianxiang Fu <nianxiangfu@whu.edu.cn> Date: Mon, 20 Jan 2025 21:04:35 +0800 Subject: [PATCH 3/3] Make get_bicubic_coeffs, calc_bicubic_positions, and cubic_coeffs const --- sadl/layer_gridsample.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sadl/layer_gridsample.h b/sadl/layer_gridsample.h index 2c03a8c..4847ba7 100644 --- a/sadl/layer_gridsample.h +++ b/sadl/layer_gridsample.h @@ -55,9 +55,9 @@ protected: bool gridsample_bilinear(std::vector<Tensor<T> *> &in); bool gridsample_bicubic(std::vector<Tensor<T> *> &in); void get_bilinear_coeffs(T2 y, T2 x, int pos[], T2 coeffs[]); - void get_bicubic_coeffs(T2 y, T2 x, T2 coeffs[]); - void calc_bicubic_positions(T2 y, T2 x, int H, int W, int pos[][2]); - T2 cubic_coeffs(T2 x, int i); + void get_bicubic_coeffs(T2 y, T2 x, T2 coeffs[]) const; + void calc_bicubic_positions(T2 y, T2 x, int H, int W, int pos[][2]) const; + T2 cubic_coeffs(T2 x, int i) const; void gs_denormalize(T2 &x, int length); void pixel_addr_at_grid(int H, int W, int &i, int &j); @@ -345,7 +345,7 @@ template<typename T> void GridSample<T>::pixel_addr_at_grid(int H, int W, int &i } } -template<> void GridSample<float>::calc_bicubic_positions(float y, float x, int H, int W, int pos[][2]) +template<> void GridSample<float>::calc_bicubic_positions(float y, float x, int H, int W, int pos[][2]) const { float x_int = std::floor(x); float y_int = std::floor(y); @@ -363,7 +363,7 @@ template<> void GridSample<float>::calc_bicubic_positions(float y, float x, int } } -template<typename T> void GridSample<T>::calc_bicubic_positions(T2 y, T2 x, int H, int W, int pos[][2]) +template<typename T> void GridSample<T>::calc_bicubic_positions(T2 y, T2 x, int H, int W, int pos[][2]) const { int shift = m_grid_quantizer + 1; T2 x_int = x >> shift; // floor @@ -382,7 +382,7 @@ template<typename T> void GridSample<T>::calc_bicubic_positions(T2 y, T2 x, int } } -template<> inline float GridSample<float>::cubic_coeffs(float x, int i) +template<> inline float GridSample<float>::cubic_coeffs(float x, int i) const { if (i == -1) return (-3 * x * x * x + 6 * x * x - 3 * x) / 4; @@ -394,7 +394,7 @@ template<> inline float GridSample<float>::cubic_coeffs(float x, int i) return (3 * x * x * x - 3 * x * x) / 4; } -template<typename T> inline typename GridSample<T>::T2 GridSample<T>::cubic_coeffs(T2 x, int i) +template<typename T> inline typename GridSample<T>::T2 GridSample<T>::cubic_coeffs(T2 x, int i) const { int shift = m_grid_quantizer + 1; T2 x2 = x * x; @@ -411,7 +411,7 @@ template<typename T> inline typename GridSample<T>::T2 GridSample<T>::cubic_coef return (3 * x3 - 3 * x2) >> 2; } -template<> inline void GridSample<float>::get_bicubic_coeffs(float y_ori, float x_ori, float coeffs[]) +template<> inline void GridSample<float>::get_bicubic_coeffs(float y_ori, float x_ori, float coeffs[]) const { float x_ori_int = std::floor(x_ori); float y_ori_int = std::floor(y_ori); @@ -435,7 +435,7 @@ template<> inline void GridSample<float>::get_bicubic_coeffs(float y_ori, float } } -template<typename T> void GridSample<T>::get_bicubic_coeffs(T2 y_ori, T2 x_ori, T2 coeffs[]) +template<typename T> void GridSample<T>::get_bicubic_coeffs(T2 y_ori, T2 x_ori, T2 coeffs[]) const { int shift = m_grid_quantizer + 1; -- GitLab