summaryrefslogtreecommitdiff
path: root/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h
diff options
context:
space:
mode:
author3gg <3gg@shellblade.net>2025-12-27 12:03:39 -0800
committer3gg <3gg@shellblade.net>2025-12-27 12:03:39 -0800
commit5a079a2d114f96d4847d1ee305d5b7c16eeec50e (patch)
tree8926ab44f168acf787d8e19608857b3af0f82758 /contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h
Initial commit
Diffstat (limited to 'contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h')
-rw-r--r--contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h372
1 files changed, 372 insertions, 0 deletions
diff --git a/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h
new file mode 100644
index 0000000..89d582a
--- /dev/null
+++ b/contrib/SDL-3.2.8/src/video/yuv2rgb/yuv_rgb_lsx_func.h
@@ -0,0 +1,372 @@
1// Copyright 2016 Adrien Descamps
// Distributed under BSD 3-Clause License
3
4#include <lsxintrin.h>
5
#if YUV_FORMAT != YUV_FORMAT_420
#error READ_UV unimplemented
#else

/* Load one 128-bit vector of luma bytes from y_ptr into `y`.
 * `y` must be declared at the expansion site. */
#define READ_Y(y_ptr) \
    y = __lsx_vld(y_ptr, 0);

/* Load one 128-bit vector each from u_ptr and v_ptr into `u_temp` and
 * `v_temp`. All four names must be in scope at the expansion site. */
#define READ_UV \
    u_temp = __lsx_vld(u_ptr, 0); \
    v_temp = __lsx_vld(v_ptr, 0);

#endif
18
/*
 * Interleave four 8-bit channel vectors into packed 32-bit pixels.
 *
 * The parameter names are positional only: callers pass the channels in
 * whatever order produces the pixel layout they need. The "1" inputs yield
 * RGB1..RGB4 and the "2" inputs yield RGB5..RGB8.
 *
 * NOTE(review): going by the LSX interleave semantics, each output pixel
 * should hold, lowest address first, the A, B, G, R arguments -- confirm
 * against the intrinsic reference before relying on it.
 */
#define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, \
                     RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) \
{ \
    /* first group of inputs */ \
    const __m128i ba1_lo = __lsx_vilvl_b(B1, A1); \
    const __m128i ba1_hi = __lsx_vilvh_b(B1, A1); \
    const __m128i rg1_lo = __lsx_vilvl_b(R1, G1); \
    const __m128i rg1_hi = __lsx_vilvh_b(R1, G1); \
    /* second group of inputs */ \
    const __m128i ba2_lo = __lsx_vilvl_b(B2, A2); \
    const __m128i ba2_hi = __lsx_vilvh_b(B2, A2); \
    const __m128i rg2_lo = __lsx_vilvl_b(R2, G2); \
    const __m128i rg2_hi = __lsx_vilvh_b(R2, G2); \
    \
    RGB1 = __lsx_vilvl_h(rg1_lo, ba1_lo); \
    RGB2 = __lsx_vilvh_h(rg1_lo, ba1_lo); \
    RGB3 = __lsx_vilvl_h(rg1_hi, ba1_hi); \
    RGB4 = __lsx_vilvh_h(rg1_hi, ba1_hi); \
    RGB5 = __lsx_vilvl_h(rg2_lo, ba2_lo); \
    RGB6 = __lsx_vilvh_h(rg2_lo, ba2_lo); \
    RGB7 = __lsx_vilvl_h(rg2_hi, ba2_hi); \
    RGB8 = __lsx_vilvh_h(rg2_hi, ba2_hi); \
}
40
/*
 * Shuffle one vector each of R, G and B bytes into three output vectors of
 * packed 24-bit pixels. The byte selection is driven entirely by the shuffle
 * tables mask1..mask5, which must be in scope at the expansion site.
 */
#define PACK_RGB24_32_STEP(R, G, B, RGB1, RGB2, RGB3) \
    RGB1 = __lsx_vshuf_b(B, __lsx_vilvl_b(G, R), mask1); \
    RGB2 = __lsx_vshuf_b(R, __lsx_vshuf_b(B, G, mask2), mask3); \
    RGB3 = __lsx_vshuf_b(G, __lsx_vshuf_b(R, B, mask4), mask5);

/*
 * Pack two groups of channel vectors: the "1" inputs fill RGB1..RGB3 and the
 * "2" inputs fill RGB4..RGB6. Each step already ends in ';'.
 */
#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
    PACK_RGB24_32_STEP(R1, G1, B1, RGB1, RGB2, RGB3) \
    PACK_RGB24_32_STEP(R2, G2, B2, RGB4, RGB5, RGB6)
52
/*
 * PACK_PIXEL declares the output vectors rgb_N and fills them from the 8-bit
 * channel vectors produced by YUV2RGB_32. In those names the first digit is
 * the line (1 or 2) and the second is the 16-pixel half of the 32-pixel
 * group, e.g. r_8_12 is line 1, second half.
 *
 * For the 32-bit formats the channel layout is selected purely by the order
 * in which the channels are handed to PACK_RGBA_32; `a` is the constant
 * alpha vector.
 */
#if RGB_FORMAT == RGB_FORMAT_RGB24

#define PACK_PIXEL \
    /* line 1 */ \
    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \
    /* line 2 */ \
    __m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; \
    PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, \
                  rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \
    PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, \
                  rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12)

#elif RGB_FORMAT == RGB_FORMAT_RGBA

#define PACK_PIXEL \
    /* line 1 */ \
    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    /* line 2 */ \
    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
    __m128i a = __lsx_vldi(0xFF); \
    PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a, \
                 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
    PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a, \
                 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)

#elif RGB_FORMAT == RGB_FORMAT_BGRA

#define PACK_PIXEL \
    /* line 1 */ \
    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    /* line 2 */ \
    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
    __m128i a = __lsx_vldi(0xFF); \
    PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a, \
                 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
    PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a, \
                 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)

#elif RGB_FORMAT == RGB_FORMAT_ARGB

#define PACK_PIXEL \
    /* line 1 */ \
    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    /* line 2 */ \
    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
    __m128i a = __lsx_vldi(0xFF); \
    PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, \
                 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
    PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, \
                 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)

#elif RGB_FORMAT == RGB_FORMAT_ABGR

#define PACK_PIXEL \
    /* line 1 */ \
    __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    /* line 2 */ \
    __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
    __m128i a = __lsx_vldi(0xFF); \
    PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, \
                 rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
    PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, \
                 rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16)

#else
#error PACK_PIXEL unimplemented
#endif
110
/* Store two vectors: in0 at pdst and in1 at pdst + stride. */
#define LSX_ST_UB2(in0, in1, pdst, stride) \
    do { \
        __lsx_vst((in0), (pdst), 0); \
        __lsx_vst((in1), (pdst) + (stride), 0); \
    } while (0)
116
/*
 * SAVE_LINE1 / SAVE_LINE2 write the packed vectors produced by PACK_PIXEL to
 * rgb_ptr1 / rgb_ptr2, 16 bytes per store and 32 bytes per LSX_ST_UB2 pair.
 *
 * The directives below deliberately carry no trailing line continuation: a
 * '\' at the end of an #if/#elif line would splice the next source line into
 * the condition.
 */
#if RGB_FORMAT == RGB_FORMAT_RGB24

/* Six 16-byte stores per line. */
#define SAVE_LINE1 \
    LSX_ST_UB2(rgb_1, rgb_2, rgb_ptr1, 16); \
    LSX_ST_UB2(rgb_3, rgb_4, rgb_ptr1 + 32, 16); \
    LSX_ST_UB2(rgb_5, rgb_6, rgb_ptr1 + 64, 16);

#define SAVE_LINE2 \
    LSX_ST_UB2(rgb_7, rgb_8, rgb_ptr2, 16); \
    LSX_ST_UB2(rgb_9, rgb_10, rgb_ptr2 + 32, 16); \
    LSX_ST_UB2(rgb_11, rgb_12, rgb_ptr2 + 64, 16);

#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
      RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR

/* Eight 16-byte stores per line. */
#define SAVE_LINE1 \
    LSX_ST_UB2(rgb_1, rgb_2, rgb_ptr1, 16); \
    LSX_ST_UB2(rgb_3, rgb_4, rgb_ptr1 + 32, 16); \
    LSX_ST_UB2(rgb_5, rgb_6, rgb_ptr1 + 64, 16); \
    LSX_ST_UB2(rgb_7, rgb_8, rgb_ptr1 + 96, 16);

#define SAVE_LINE2 \
    LSX_ST_UB2(rgb_9, rgb_10, rgb_ptr2, 16); \
    LSX_ST_UB2(rgb_11, rgb_12, rgb_ptr2 + 32, 16); \
    LSX_ST_UB2(rgb_13, rgb_14, rgb_ptr2 + 64, 16); \
    LSX_ST_UB2(rgb_15, rgb_16, rgb_ptr2 + 96, 16);

#else
#error SAVE_LINE unimplemented
#endif
147
// r = v*vr, g = u*ug + v*vg, b = u*ub
//
// Computes the chroma contribution to each colour channel from the 16-bit
// U and V vectors, using the coefficient vectors v2r, u2g, v2g and u2b and
// the scratch vectors r_temp, g_temp and b_temp from the expansion site.
// Each product is then interleaved with itself: the low half goes to the
// "1" outputs and the high half to the "2" outputs.
#define UV2RGB_16(U, V, R1, G1, B1, R2, G2, B2) \
    r_temp = __lsx_vmul_h(V, v2r); \
    b_temp = __lsx_vmul_h(U, u2b); \
    g_temp = __lsx_vmadd_h(__lsx_vmul_h(U, u2g), V, v2g); \
    R1 = __lsx_vilvl_h(r_temp, r_temp); \
    R2 = __lsx_vilvh_h(r_temp, r_temp); \
    G1 = __lsx_vilvl_h(g_temp, g_temp); \
    G2 = __lsx_vilvh_h(g_temp, g_temp); \
    B1 = __lsx_vilvl_h(b_temp, b_temp); \
    B2 = __lsx_vilvh_h(b_temp, b_temp);
160
// Y = (Y - shift) * yf;  R = (R + Y) >> PRECISION, and likewise for G and B.
//
// Y1 and Y2 are rescaled in place, then added to the chroma terms already
// held in R/G/B, and the sums are arithmetically shifted right by PRECISION.
// `shift` and `yf` are vectors taken from the expansion site.
#define ADD_Y2RGB_16(Y1, Y2, R1, G1, B1, R2, G2, B2) \
    Y1 = __lsx_vmul_h(__lsx_vsub_h(Y1, shift), yf); \
    Y2 = __lsx_vmul_h(__lsx_vsub_h(Y2, shift), yf); \
    R1 = __lsx_vsrai_h(__lsx_vadd_h(R1, Y1), PRECISION); \
    G1 = __lsx_vsrai_h(__lsx_vadd_h(G1, Y1), PRECISION); \
    B1 = __lsx_vsrai_h(__lsx_vadd_h(B1, Y1), PRECISION); \
    R2 = __lsx_vsrai_h(__lsx_vadd_h(R2, Y2), PRECISION); \
    G2 = __lsx_vsrai_h(__lsx_vadd_h(G2, Y2), PRECISION); \
    B2 = __lsx_vsrai_h(__lsx_vadd_h(B2, Y2), PRECISION);
179
/*
 * Clamp six vectors of signed 16-bit lanes in place: negative lanes are
 * raised to 0, then each lane is saturated as unsigned with immediate 7.
 * NOTE(review): immediate 7 should mean an upper bound of 255 -- confirm
 * against the __lsx_vsat_hu reference.
 */
#define CLIP(in0, in1, in2, in3, in4, in5) \
    do { \
        in0 = __lsx_vsat_hu(__lsx_vmaxi_h(in0, 0), 7); \
        in1 = __lsx_vsat_hu(__lsx_vmaxi_h(in1, 0), 7); \
        in2 = __lsx_vsat_hu(__lsx_vmaxi_h(in2, 0), 7); \
        in3 = __lsx_vsat_hu(__lsx_vmaxi_h(in3, 0), 7); \
        in4 = __lsx_vsat_hu(__lsx_vmaxi_h(in4, 0), 7); \
        in5 = __lsx_vsat_hu(__lsx_vmaxi_h(in5, 0), 7); \
    } while (0)
195
/*
 * Convert a 32-pixel-wide, two-line tile from YUV to 8-bit R, G and B
 * channel vectors.
 *
 * Inputs taken from the expansion site: y_ptr1, y_ptr2, u_ptr, v_ptr,
 * y_pixel_stride, and the constant vectors zero, bias, shift, yf, v2r, v2g,
 * u2g and u2b.
 * Outputs declared here and consumed by PACK_PIXEL: {r,g,b}_8_LH, where L is
 * the line (1 or 2) and H is the 16-pixel half (1 or 2).
 *
 * The chroma terms are computed once per half and reused for both lines.
 */
#define YUV2RGB_32 \
    __m128i y, u_temp, v_temp; \
    __m128i u, v, r_temp, g_temp, b_temp; \
    __m128i y_lo, y_hi; \
    __m128i r_lo, g_lo, b_lo, r_hi, g_hi, b_hi; \
    __m128i uv_r_lo, uv_g_lo, uv_b_lo, uv_r_hi, uv_g_hi, uv_b_hi; \
    __m128i r_8_11, g_8_11, b_8_11, r_8_12, g_8_12, b_8_12; \
    __m128i r_8_21, g_8_21, b_8_21, r_8_22, g_8_22, b_8_22; \
    \
    READ_UV \
    \
    /* chroma terms for the first 16 pixels (low half of U/V) */ \
    u = __lsx_vsub_h(__lsx_vilvl_b(zero, u_temp), bias); \
    v = __lsx_vsub_h(__lsx_vilvl_b(zero, v_temp), bias); \
    UV2RGB_16(u, v, uv_r_lo, uv_g_lo, uv_b_lo, uv_r_hi, uv_g_hi, uv_b_hi); \
    \
    /* first 16 pixels of first line */ \
    r_lo = uv_r_lo; g_lo = uv_g_lo; b_lo = uv_b_lo; \
    r_hi = uv_r_hi; g_hi = uv_g_hi; b_hi = uv_b_hi; \
    READ_Y(y_ptr1) \
    y_lo = __lsx_vilvl_b(zero, y); \
    y_hi = __lsx_vilvh_b(zero, y); \
    ADD_Y2RGB_16(y_lo, y_hi, r_lo, g_lo, b_lo, r_hi, g_hi, b_hi) \
    CLIP(r_lo, g_lo, b_lo, r_hi, g_hi, b_hi); \
    r_8_11 = __lsx_vpickev_b(r_hi, r_lo); \
    g_8_11 = __lsx_vpickev_b(g_hi, g_lo); \
    b_8_11 = __lsx_vpickev_b(b_hi, b_lo); \
    \
    /* first 16 pixels of second line */ \
    r_lo = uv_r_lo; g_lo = uv_g_lo; b_lo = uv_b_lo; \
    r_hi = uv_r_hi; g_hi = uv_g_hi; b_hi = uv_b_hi; \
    READ_Y(y_ptr2) \
    y_lo = __lsx_vilvl_b(zero, y); \
    y_hi = __lsx_vilvh_b(zero, y); \
    ADD_Y2RGB_16(y_lo, y_hi, r_lo, g_lo, b_lo, r_hi, g_hi, b_hi) \
    CLIP(r_lo, g_lo, b_lo, r_hi, g_hi, b_hi); \
    r_8_21 = __lsx_vpickev_b(r_hi, r_lo); \
    g_8_21 = __lsx_vpickev_b(g_hi, g_lo); \
    b_8_21 = __lsx_vpickev_b(b_hi, b_lo); \
    \
    /* chroma terms for the last 16 pixels (high half of U/V) */ \
    u = __lsx_vsub_h(__lsx_vilvh_b(zero, u_temp), bias); \
    v = __lsx_vsub_h(__lsx_vilvh_b(zero, v_temp), bias); \
    UV2RGB_16(u, v, uv_r_lo, uv_g_lo, uv_b_lo, uv_r_hi, uv_g_hi, uv_b_hi); \
    \
    /* last 16 pixels of first line */ \
    r_lo = uv_r_lo; g_lo = uv_g_lo; b_lo = uv_b_lo; \
    r_hi = uv_r_hi; g_hi = uv_g_hi; b_hi = uv_b_hi; \
    READ_Y(y_ptr1 + 16 * y_pixel_stride) \
    y_lo = __lsx_vilvl_b(zero, y); \
    y_hi = __lsx_vilvh_b(zero, y); \
    ADD_Y2RGB_16(y_lo, y_hi, r_lo, g_lo, b_lo, r_hi, g_hi, b_hi) \
    CLIP(r_lo, g_lo, b_lo, r_hi, g_hi, b_hi); \
    r_8_12 = __lsx_vpickev_b(r_hi, r_lo); \
    g_8_12 = __lsx_vpickev_b(g_hi, g_lo); \
    b_8_12 = __lsx_vpickev_b(b_hi, b_lo); \
    \
    /* last 16 pixels of second line */ \
    r_lo = uv_r_lo; g_lo = uv_g_lo; b_lo = uv_b_lo; \
    r_hi = uv_r_hi; g_hi = uv_g_hi; b_hi = uv_b_hi; \
    READ_Y(y_ptr2 + 16 * y_pixel_stride) \
    y_lo = __lsx_vilvl_b(zero, y); \
    y_hi = __lsx_vilvh_b(zero, y); \
    ADD_Y2RGB_16(y_lo, y_hi, r_lo, g_lo, b_lo, r_hi, g_hi, b_hi) \
    CLIP(r_lo, g_lo, b_lo, r_hi, g_hi, b_hi); \
    r_8_22 = __lsx_vpickev_b(r_hi, r_lo); \
    g_8_22 = __lsx_vpickev_b(g_hi, g_lo); \
    b_8_22 = __lsx_vpickev_b(b_hi, b_lo);
267
268void LSX_FUNCTION_NAME(uint32_t width, uint32_t height, const uint8_t *Y,
269 const uint8_t *U, const uint8_t *V, uint32_t Y_stride,
270 uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride,
271 YCbCrType yuv_type)
272{
273 const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
274#if YUV_FORMAT == YUV_FORMAT_420
275 const int y_pixel_stride = 1;
276 const int uv_pixel_stride = 1;
277 const int uv_x_sample_interval = 2;
278 const int uv_y_sample_interval = 2;
279#endif
280
281#if RGB_FORMAT == RGB_FORMAT_RGB565
282 const int rgb_pixel_stride = 2;
283#elif RGB_FORMAT == RGB_FORMAT_RGB24
284 const int rgb_pixel_stride = 3;
285 __m128i mask1 = {0x0504110302100100, 0x0A14090813070612};
286 __m128i mask2 = {0x1808170716061505, 0x00000000000A1909};
287 __m128i mask3 = {0x0504170302160100, 0x0A1A090819070618};
288 __m128i mask4 = {0x1E0D1D0C1C0B1B0A, 0x00000000000F1F0E};
289 __m128i mask5 = {0x05041C03021B0100, 0x0A1F09081E07061D};
290#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT_BGRA || \
291 RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT_ABGR
292 const int rgb_pixel_stride = 4;
293#else
294#error Unknown RGB pixel size
295#endif
296
297 uint32_t xpos, ypos;
298 __m128i v2r = __lsx_vreplgr2vr_h(param->v_r_factor);
299 __m128i v2g = __lsx_vreplgr2vr_h(param->v_g_factor);
300 __m128i u2g = __lsx_vreplgr2vr_h(param->u_g_factor);
301 __m128i u2b = __lsx_vreplgr2vr_h(param->u_b_factor);
302 __m128i bias = __lsx_vreplgr2vr_h(128);
303 __m128i shift = __lsx_vreplgr2vr_h(param->y_shift);
304 __m128i yf = __lsx_vreplgr2vr_h(param->y_factor);
305 __m128i zero = __lsx_vldi(0);
306
307 if (width >= 32) {
308 for (ypos = 0; ypos < (height - (uv_y_sample_interval - 1)); ypos += uv_y_sample_interval) {
309 const uint8_t *y_ptr1 = Y + ypos * Y_stride,
310 *y_ptr2 = Y + (ypos + 1) * Y_stride,
311 *u_ptr = U + (ypos/uv_y_sample_interval) * UV_stride,
312 *v_ptr = V + (ypos/uv_y_sample_interval) * UV_stride;
313 uint8_t *rgb_ptr1 = RGB + ypos * RGB_stride,
314 *rgb_ptr2 = RGB + (ypos + 1) * RGB_stride;
315
316 for (xpos = 0; xpos < (width - 31); xpos += 32){
317 YUV2RGB_32
318 {
319 PACK_PIXEL
320 SAVE_LINE1
321 if (uv_y_sample_interval > 1)
322 {
323 SAVE_LINE2
324 }
325 }
326 y_ptr1 += 32 * y_pixel_stride;
327 y_ptr2 += 32 * y_pixel_stride;
328 u_ptr += 32 * uv_pixel_stride/uv_x_sample_interval;
329 v_ptr += 32 * uv_pixel_stride/uv_x_sample_interval;
330 rgb_ptr1 += 32 * rgb_pixel_stride;
331 rgb_ptr2 += 32 * rgb_pixel_stride;
332 }
333 }
334 if (uv_y_sample_interval == 2 && ypos == (height - 1)) {
335 const uint8_t *y_ptr = Y + ypos * Y_stride,
336 *u_ptr = U + (ypos/uv_y_sample_interval) * UV_stride,
337 *v_ptr = V + (ypos/uv_y_sample_interval) * UV_stride;
338 uint8_t *rgb_ptr = RGB + ypos * RGB_stride;
339 STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
340 }
341 }
342 {
343 int converted = (width & ~31);
344 if (converted != width)
345 {
346 const uint8_t *y_ptr = Y + converted * y_pixel_stride,
347 *u_ptr = U + converted * uv_pixel_stride / uv_x_sample_interval,
348 *v_ptr = V + converted * uv_pixel_stride / uv_x_sample_interval;
349 uint8_t *rgb_ptr = RGB + converted * rgb_pixel_stride;
350
351 STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
352 }
353 }
354}
355
/* Template parameters supplied by the including file. */
#undef LSX_FUNCTION_NAME
#undef STD_FUNCTION_NAME
#undef YUV_FORMAT
#undef RGB_FORMAT
#undef LSX_ALIGNED

/* Helper macros defined by this header. CLIP is included so that no helper
 * leaks past the end of the header. */
#undef LSX_ST_UB2
#undef UV2RGB_16
#undef ADD_Y2RGB_16
#undef CLIP
#undef PACK_RGB24_32_STEP
#undef PACK_RGB24_32
#undef PACK_PIXEL
#undef PACK_RGBA_32
#undef SAVE_LINE1
#undef SAVE_LINE2
#undef READ_Y
#undef READ_UV
#undef YUV2RGB_32