Compute Library
 23.05
scale.cl
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "helpers.h"
25 #include "tile_helpers.h"
26 
27 /** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
28  *
29  * @param[in] coord 2D coordinates to transform.
30  * @param[in] scale input/output scale ratio
31  *
32  * @return a float8 containing 4 2D transformed values in the input image.
33  */
34 inline const float8 transform_nearest(const float2 coord, const float2 scale)
35 {
36 #ifdef SAMPLING_POLICY_TOP_LEFT
37  const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
38  const float4 new_x = in_x_coords * (float4)(scale.s0);
39  const float4 new_y = (float4)(coord.s1 * scale.s1);
40  return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
41 #elif SAMPLING_POLICY_CENTER
42  const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
43  const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
44  const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1);
45  return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
46 #else /* SAMPLING_POLICY */
47 #error("Unsupported sampling policy");
48 #endif /* SAMPLING_POLICY */
49 }
50 
51 /** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
52  *
53  * @param[in] coord 2D coordinates to transform.
54  * @param[in] scale input/output scale ratio
55  *
56  * @return a float8 containing 4 2D transformed values in the input image.
57  */
58 inline const float8 transform_bilinear(const float2 coord, const float2 scale)
59 {
60  const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
61 #ifdef SAMPLING_POLICY_TOP_LEFT
62  const float4 new_x = in_x_coords * (float4)(scale.s0);
63  const float4 new_y = (float4)(coord.s1 * scale.s1);
64  return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
65 #elif SAMPLING_POLICY_CENTER
66  const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
67  const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
68  return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
69 #else /* SAMPLING_POLICY */
70 #error("Unsupported sampling policy");
71 #endif /* SAMPLING_POLICY */
72 }
73 
74 /** Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8 or S16.
75  *
76  * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
77  *
78  * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
79  * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
80  * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
81  * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
82  * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
83  * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
84  * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
85  * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
86  * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
87  * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
88  * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
89  * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
90  */
93  IMAGE_DECLARATION(out))
94 {
95  const int x = get_global_id(0);
96  const int y = get_global_id(1);
97 
98  float8 transformed = transform_nearest((float2)(x * VEC_SIZE, y), (float2)(SCALE_X, SCALE_Y));
99 #ifdef ALIGN_CORNERS
100  transformed = round(transformed);
101 #endif // ALIGN_CORNERS
102 
103  TILE(SELECT_DATA_TYPE(DATA_TYPE), 1, 4, cond);
104  cond[0].v = CONVERT(((transformed.even < 0) || (transformed.even >= (int)SRC_WIDTH)) || ((transformed.odd < 0) || (transformed.odd >= (int)SRC_HEIGHT)), SELECT_VEC_DATA_TYPE(DATA_TYPE, 4));
105 
106  TILE(int, 1, 4, in_x);
107  TILE(int, 1, 4, in_y);
108  in_x[0].v = convert_int4(clamp(transformed.even, 0.f, SRC_WIDTH - 1.f));
109  in_y[0].v = convert_int4(clamp(transformed.odd, 0.f, SRC_HEIGHT - 1.f));
110 
111  TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
112  LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
113  {
114  out_vals[0].s[i] = select(*((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * in_stride_y)), (DATA_TYPE)CONSTANT_VALUE, cond[0].s[i]);
115  })
116 
117  __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_step_x + y * out_stride_y;
118 
119  if(x == get_global_size(0) - 1)
120  {
121 #if VEC_SIZE == 1
122  VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
123  (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
124 #else // VEC_SIZE == 1
125  VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
126  (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
127 #endif // VEC_SIZE == 1
128  }
129  else
130  {
131 #if VEC_SIZE == 1
132  VSTORE(VEC_SIZE)
133  (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
134 #else // VEC_SIZE == 1
135  VSTORE(VEC_SIZE)
136  (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
137 #endif // VEC_SIZE == 1
138  }
139 }
140 
141 /** Performs an affine transformation on an image interpolating with the BILINEAR method.
142  *
143  * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
144  *
145  * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
146  * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
147  * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
148  * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
149  * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
150  * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
151  * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
152  * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
153  * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
154  * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
155  * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
156  * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
157  */
158 __kernel void scale_bilinear_nchw(
159  IMAGE_DECLARATION(in),
160  IMAGE_DECLARATION(out))
161 {
162  const int x = get_global_id(0);
163  const int y = get_global_id(1);
164 
165  TILE(float, 1, 8, trans_coords);
166  TILE(float, 1, 8, floor_coords);
167  TILE(int, 1, 16, in_x);
168  TILE(int, 1, 16, in_y);
169 
170  trans_coords[0].v = transform_bilinear((float2)(x * VEC_SIZE, y), (float2)(SCALE_X, SCALE_Y));
171  floor_coords[0].v = floor(trans_coords[0].v);
172 
173  LOOP_UNROLLING(int, i, 0, 1, 4,
174  {
175  LOOP_UNROLLING(int, j, 0, 1, 4,
176  {
177  in_x[0].s[i * 4 + j] = floor_coords[0].s[i * 2 + 0] + (j % 2);
178  in_y[0].s[i * 4 + j] = floor_coords[0].s[i * 2 + 1] + (j > 1);
179  })
180  })
181 
182 #if defined(BORDER_MODE_CONSTANT)
183  TILE(SELECT_DATA_TYPE(DATA_TYPE), 1, 16, cond);
184  cond[0].v = CONVERT(((in_x[0].v < 0) || (in_x[0].v >= (int)SRC_WIDTH)) || ((in_y[0].v < 0) || (in_y[0].v >= (int)SRC_HEIGHT)), SELECT_VEC_DATA_TYPE(DATA_TYPE, 16));
185 #endif // defined(BORDER_MODE_CONSTANT)
186 
187  in_x[0].v = clamp(in_x[0].v, 0, (int16)((int)SRC_WIDTH - 1));
188  in_y[0].v = clamp(in_y[0].v, 0, (int16)((int)SRC_HEIGHT - 1));
189 
190  TILE(DATA_TYPE, 1, 16, in_vals);
191 
192  // Loads the values from the input image
193 #if defined(BORDER_MODE_CONSTANT)
194  LOOP_UNROLLING(int, i, 0, 1, 16,
195  {
196  in_vals[0].s[i] = select(*((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * (int)in_stride_y)), (DATA_TYPE)CONSTANT_VALUE, cond[0].s[i]);
197  })
198 #else // defined(BORDER_MODE_CONSTANT)
199  LOOP_UNROLLING(int, i, 0, 1, 16,
200  {
201  in_vals[0].s[i] = *((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * (int)in_stride_y));
202  })
203 #endif // defined(BORDER_MODE_CONSTANT)
204 
205  TILE(float, 1, 8, a);
206  TILE(float, 1, 8, b);
207 
208  a[0].v = trans_coords[0].v - floor_coords[0].v;
209  b[0].v = ((float8)(1.f)) - a[0].v;
210 
211 #if defined(OFFSET) && defined(SCALE)
212  TILE(float, 1, 16, in_vals_f32);
213  TILE(float, 1, 4, out_vals_f32);
214 
215  in_vals_f32[0].v = convert_float16(convert_int16(in_vals[0].v) - (int16)OFFSET) * (float16)SCALE;
216 
217  // Bilinear interpolation: (in0 * b0 * b1) + (in1 * a0 * b1) + (in2 * b0 * a1) + (in3 * a0 * a1)
218  // (in4 * b2 * b3) + (in5 * a2 * b3) + (in6 * b2 * a3) + (in7 * a2 * a3)
219  // (in8 * b4 * b5) + (in9 * a4 * b5) + (in10 * b4 * a5) + (in11 * a4 * a5)
220  // (in12 * b6 * b7) + (in13 * a6 * b7) + (in14 * b6 * a7) + (in15 * a6 * a7)
221  LOOP_UNROLLING(int, i, 0, 1, 4,
222  {
223  out_vals_f32[0].s[i] = (in_vals_f32[0].s[i * 4 + 0] * b[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 1] * a[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 2] * b[0].s[i * 2] * a[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 3] * a[0].s[i * 2] * a[0].s[i * 2 + 1]);
224  })
225 
226  TILE(DATA_TYPE, 1, 4, out_vals_4);
227  TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
228 
229  out_vals_4[0].v = CONVERT_SAT(convert_int4_sat_rtp(out_vals_f32[0].v / (float)SCALE) + OFFSET, VEC_DATA_TYPE(DATA_TYPE, 4));
230 
231  LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
232  {
233  out_vals[0].s[i] = out_vals_4[0].s[i];
234  })
235 #else // defined(OFFSET) && defined(SCALE)
236 
237  TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
238 
239  // Bilinear interpolation: (in0 * b0 * b1) + (in1 * a0 * b1) + (in2 * b0 * a1) + (in3 * a0 * a1)
240  // (in4 * b2 * b3) + (in5 * a2 * b3) + (in6 * b2 * a3) + (in7 * a2 * a3)
241  // (in8 * b4 * b5) + (in9 * a4 * b5) + (in10 * b4 * a5) + (in11 * a4 * a5)
242  // (in12 * b6 * b7) + (in13 * a6 * b7) + (in14 * b6 * a7) + (in15 * a6 * a7)
243  LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
244  {
245  out_vals[0].s[i] = (in_vals[0].s[i * 4 + 0] * b[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 1] * a[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 2] * b[0].s[i * 2] * a[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 3] * a[0].s[i * 2] * a[0].s[i * 2 + 1]);
246  })
247 #endif // defined(OFFSET) && defined(SCALE)
248 
249  __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_step_x + y * out_stride_y;
250 
251  if(x == get_global_size(0) - 1)
252  {
253 #if VEC_SIZE == 1
254  VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
255  (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
256 #else // VEC_SIZE == 1
257  VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
258  (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
259 #endif // VEC_SIZE == 1
260  }
261  else
262  {
263 #if VEC_SIZE == 1
264  VSTORE(VEC_SIZE)
265  (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
266 #else // VEC_SIZE == 1
267  VSTORE(VEC_SIZE)
268  (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
269 #endif // VEC_SIZE == 1
270  }
271 }
#define VEC_SIZE
#define VSTORE_PARTIAL(size, store_size)
Definition: helpers.h:517
#define CONVERT(x, type)
Definition: helpers.h:757
SimpleTensor< float > b
Definition: DFT.cpp:157
#define LOOP_UNROLLING(type, idx, start, step, num, macro)
Definition: tile_helpers.h:340
#define IMAGE_DECLARATION(name)
Definition: helpers.h:831
const float8 transform_bilinear(const float2 coord, const float2 scale)
Transforms four 2D coordinates.
Definition: scale.cl:58
__kernel void scale_bilinear_nchw(__global uchar *in_ptr, uint in_stride_x, uint in_step_x, uint in_stride_y, uint in_step_y, uint in_offset_first_element_in_bytes, __global uchar *out_ptr, uint out_stride_x, uint out_step_x, uint out_stride_y, uint out_step_y, uint out_offset_first_element_in_bytes)
Performs an affine transformation on an image interpolating with the BILINEAR method.
Definition: scale.cl:158
const float8 transform_nearest(const float2 coord, const float2 scale)
Transforms four 2D coordinates.
Definition: scale.cl:34
DataType clamp(const DataType &n, const DataType &lower=std::numeric_limits< RangeType >::lowest(), const DataType &upper=std::numeric_limits< RangeType >::max())
Performs clamping among a lower and upper value.
Definition: Utility.h:102
#define SELECT_DATA_TYPE(type)
Definition: helpers.h:778
#define CONVERT_SAT(x, type)
Definition: helpers.h:760
__kernel void scale_nearest_neighbour_nchw(__global uchar *in_ptr, uint in_stride_x, uint in_step_x, uint in_stride_y, uint in_step_y, uint in_offset_first_element_in_bytes, __global uchar *out_ptr, uint out_stride_x, uint out_step_x, uint out_stride_y, uint out_step_y, uint out_offset_first_element_in_bytes)
Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method...
Definition: scale.cl:91
int round(float x, RoundingPolicy rounding_policy)
Return a rounded value of x.
Definition: Rounding.cpp:35
SimpleTensor< T > select(const SimpleTensor< uint8_t > &c, const SimpleTensor< T > &x, const SimpleTensor< T > &y)
Definition: Select.cpp:38
#define SELECT_VEC_DATA_TYPE(type, size)
Definition: helpers.h:777
#define TILE(DATA_TYPE, H, W, BASENAME)
Tile object A tile object is a 2D memory block and can be accessed using the following syntax:a[m0]...
Definition: tile_helpers.h:74
#define VSTORE(size)
Definition: helpers.h:484
#define VEC_DATA_TYPE(type, size)
Definition: helpers.h:754