/* Excerpt from a float-buffer sampler: the enclosing function signature and
 * several intermediate lines are not part of this excerpt. The visible code
 * blends four neighboring texels (row1..row4) into `output`. */
/* Blend weights. ma_b, a_mb and ma_mb are assigned further down; the
 * assignment of a_b is not visible here (presumably a * b -- TODO confirm). */
231 float a_b, ma_b, a_mb, ma_mb;
/* Pointers to the four texels: row1 = (x1, y1), row2 = (x1, y2),
 * row3 = (x2, y1), row4 = (x2, y2), as set up below. */
249 const float *row1, *row2, *row3, *row4;
/* All-zero texel substituted for corners outside the image when `border` is set.
 * It holds four floats, so the 4-wide loads below stay inside the array. */
250 const float empty[4] = {0.0f, 0.0f, 0.0f, 0.0f};
/* Border mode: both sample columns are outside [0, width). The body of this
 * branch is not visible in this excerpt. */
259 else if (border && (x2 < 0 || x1 >= width)) {
/* Border mode: both sample rows are outside [0, height). The body of this
 * branch is not visible in this excerpt. */
268 else if (border && (y2 < 0 || y1 >= height)) {
/* Border mode: test every corner on its own and point out-of-image corners at
 * `empty`. The int64_t cast makes the offset multiplication 64-bit. */
274 if constexpr (border) {
275 row1 = (x1 < 0 || y1 < 0) ? empty : buffer + (
int64_t(width) * y1 + x1) * components;
276 row2 = (x1 < 0 || y2 > height - 1) ? empty : buffer + (
int64_t(width) * y2 + x1) * components;
277 row3 = (x2 > width - 1 || y1 < 0) ? empty : buffer + (
int64_t(width) * y1 + x2) * components;
278 row4 = (x2 > width - 1 || y2 > height - 1) ? empty :
279 buffer + (
int64_t(width) * y2 + x2) * components;
/* Non-border path: corners are addressed directly with no range test.
 * NOTE(review): presumably x1/x2/y1/y2 were made valid on lines not shown
 * here -- confirm. */
286 row1 = buffer + (
int64_t(width) * y1 + x1) * components;
287 row2 = buffer + (
int64_t(width) * y2 + x1) * components;
288 row3 = buffer + (
int64_t(width) * y1 + x2) * components;
289 row4 = buffer + (
int64_t(width) * y2 + x2) * components;
/* Weight naming: a leading "m" marks (1 - value), so ma_b = (1 - a) * b. */
295 ma_b = (1.0f - a) *
b;
296 a_mb = a * (1.0f -
b);
297 ma_mb = (1.0f - a) * (1.0f -
b);
/* Single channel: weighted sum of the four texels. */
299 if (components == 1) {
300 output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
/* Three channels: same weighted sum per channel. */
302 else if (components == 3) {
303 output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
304 output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
305 output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
/* Four channels, SSE version: load each texel as four floats (unaligned),
 * scale by its weight, then add pairwise as (1 + 3) + (2 + 4).
 * NOTE(review): whatever selects between this and the scalar version below
 * is not visible in this excerpt. */
309 __m128 rgba1 = _mm_loadu_ps(row1);
310 __m128 rgba2 = _mm_loadu_ps(row2);
311 __m128 rgba3 = _mm_loadu_ps(row3);
312 __m128 rgba4 = _mm_loadu_ps(row4);
313 rgba1 = _mm_mul_ps(_mm_set1_ps(ma_mb), rgba1);
314 rgba2 = _mm_mul_ps(_mm_set1_ps(ma_b), rgba2);
315 rgba3 = _mm_mul_ps(_mm_set1_ps(a_mb), rgba3);
316 rgba4 = _mm_mul_ps(_mm_set1_ps(a_b), rgba4);
317 __m128 rgba13 = _mm_add_ps(rgba1, rgba3);
318 __m128 rgba24 = _mm_add_ps(rgba2, rgba4);
319 __m128 rgba = _mm_add_ps(rgba13, rgba24);
320 _mm_storeu_ps(output, rgba);
/* Four channels, scalar version: same weighted sum per channel. The terms are
 * added left to right here, so results may differ from the SSE version in the
 * last bits. */
322 output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
323 output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
324 output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
325 output[3] = ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3];
/* Excerpt from an SSE sampler for 4-byte pixels: the enclosing function
 * signature and several intermediate lines are not part of this excerpt.
 * Lane order in the comments below is lowest lane first. */
/* uvuv = (u, v, u, v). */
337 __m128 uvuv = _mm_set_ps(
v, u,
v, u);
338 __m128 uvuv_floor = _mm_floor_ps(uvuv);
/* xy12 = (x1, y1, x2, y2) with x2 = x1 + 1 and y2 = y1 + 1. */
341 __m128i xy12 = _mm_add_epi32(_mm_cvttps_epi32(uvuv_floor), _mm_set_epi32(1, 1, 0, 0));
/* Starts from (width, height, width, height); the subtrahend is on a line that
 * is not visible here (presumably all ones, given the name -- TODO confirm). */
343 __m128i size_minus_1 = _mm_sub_epi32(_mm_set_epi32(height, width, height, width),
/* Per-sample coordinates and, in border mode, a per-sample "outside" mask. */
347 __m128i x1234, y1234, invalid_1234;
/* Border mode: flag coordinates that are negative or greater than size_minus_1. */
349 if constexpr (border) {
353 __m128i too_lo_xy12 = _mm_cmplt_epi32(xy12, _mm_setzero_si128());
354 __m128i too_hi_xy12 = _mm_cmplt_epi32(size_minus_1, xy12);
355 __m128i invalid_xy12 = _mm_or_si128(too_lo_xy12, too_hi_xy12);
/* Expand to one coordinate pair per sample:
 * x1234 = (x1, x1, x2, x2), y1234 = (y1, y2, y1, y2). */
358 x1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(2, 2, 0, 0));
359 y1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(3, 1, 3, 1));
/* A sample is invalid when either its x or its y coordinate is invalid. */
360 invalid_1234 = _mm_or_si128(_mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(2, 2, 0, 0)),
361 _mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(3, 1, 3, 1)));
/* Force invalid samples to coordinate (0, 0) so the loads below use a valid
 * index; the fetched values for those samples are zeroed after loading. */
363 x1234 = _mm_andnot_si128(invalid_1234, x1234);
364 y1234 = _mm_andnot_si128(invalid_1234, y1234);
/* Other path (its branch keyword is not visible here): clamp the coordinates
 * to [0, size_minus_1] instead of masking. */
368 __m128i xy12_clamped = _mm_max_epi32(xy12, _mm_setzero_si128());
369 xy12_clamped = _mm_min_epi32(xy12_clamped, size_minus_1);
370 x1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(2, 2, 0, 0));
371 y1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(3, 1, 3, 1));
/* Spill the integer coordinates to xcoord/ycoord (declared on lines not shown);
 * the float casts only reinterpret bits for the store. */
378 _mm_storeu_ps((
float *)xcoord, _mm_castsi128_ps(x1234));
379 _mm_storeu_ps((
float *)ycoord, _mm_castsi128_ps(y1234));
/* Fetch each pixel as one 32-bit value; the index is computed in 64 bits. */
380 int sample1 = ((
const int *)buffer)[ycoord[0] *
int64_t(width) + xcoord[0]];
381 int sample2 = ((
const int *)buffer)[ycoord[1] *
int64_t(width) + xcoord[1]];
382 int sample3 = ((
const int *)buffer)[ycoord[2] *
int64_t(width) + xcoord[2]];
383 int sample4 = ((
const int *)buffer)[ycoord[3] *
int64_t(width) + xcoord[3]];
/* sample1 goes to the lowest lane, sample4 to the highest. */
384 __m128i samples1234 = _mm_set_epi32(sample4, sample3, sample2, sample1);
385 if constexpr (border) {
/* Samples outside the image contribute zero in every channel. */
387 samples1234 = _mm_andnot_si128(invalid_1234, samples1234);
/* Widen bytes to 16-bit, then to 32-bit, and convert every pixel to four floats. */
392 __m128i rgba16_12 = _mm_unpacklo_epi8(samples1234, _mm_setzero_si128());
393 __m128i rgba16_34 = _mm_unpackhi_epi8(samples1234, _mm_setzero_si128());
395 __m128 rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_12, _mm_setzero_si128()));
396 __m128 rgba2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_12, _mm_setzero_si128()));
397 __m128 rgba3 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_34, _mm_setzero_si128()));
398 __m128 rgba4 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_34, _mm_setzero_si128()));
/* Fractional parts: abab = (a, b, a, b), m_abab = one minus those,
 * ab_mab = (a, b, 1 - a, 1 - b). */
401 __m128 abab = _mm_sub_ps(uvuv, uvuv_floor);
402 __m128 m_abab = _mm_sub_ps(_mm_set1_ps(1.0f), abab);
403 __m128 ab_mab = _mm_shuffle_ps(abab, m_abab, _MM_SHUFFLE(3, 2, 1, 0));
/* factors = ((1-a)(1-b), (1-a)b, a(1-b), ab): one weight per sample 1..4. */
404 __m128 factors = _mm_mul_ps(_mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(0, 0, 2, 2)),
405 _mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(1, 3, 1, 3)));
/* Broadcast each weight across its pixel and accumulate as (1 + 3) + (2 + 4). */
408 rgba1 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(0, 0, 0, 0)), rgba1);
409 rgba2 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(1, 1, 1, 1)), rgba2);
410 rgba3 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(2, 2, 2, 2)), rgba3);
411 rgba4 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(3, 3, 3, 3)), rgba4);
412 __m128 rgba13 = _mm_add_ps(rgba1, rgba3);
413 __m128 rgba24 = _mm_add_ps(rgba2, rgba4);
414 __m128 rgba = _mm_add_ps(rgba13, rgba24);
/* Adding 0.5 before the truncating conversion rounds to the nearest integer. */
415 rgba = _mm_add_ps(rgba, _mm_set1_ps(0.5f));
/* Narrow 32 -> 16 -> 8 bits with saturation and write the low 4 bytes to res. */
418 __m128i rgba32 = _mm_cvttps_epi32(rgba);
419 __m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
420 __m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
421 _mm_store_ss((
float *)&res, _mm_castsi128_ps(rgba8));
/* Excerpt from a scalar sampler for 4-byte pixels: the enclosing function
 * signature and several intermediate lines are not part of this excerpt. */
/* Border mode: the whole 2x2 footprint is outside the image. The body of this
 * branch is not visible in this excerpt. */
434 if (border && (x2 < 0 || x1 >= width || y2 < 0 || y1 >= height)) {
/* Pointers to the four pixels: row1 = (x1, y1), row2 = (x1, y2),
 * row3 = (x2, y1), row4 = (x2, y2), as set up below. */
439 const uchar *row1, *row2, *row3, *row4;
/* All-zero pixel substituted for corners outside the image when `border` is set. */
440 uchar empty[4] = {0, 0, 0, 0};
/* Border mode: test every corner on its own. Pixels are 4 bytes wide and the
 * int64_t cast makes the offset multiplication 64-bit. */
441 if constexpr (border) {
442 row1 = (x1 < 0 || y1 < 0) ? empty : buffer + (
int64_t(width) * y1 + x1) * 4;
443 row2 = (x1 < 0 || y2 > height - 1) ? empty : buffer + (
int64_t(width) * y2 + x1) * 4;
444 row3 = (x2 > width - 1 || y1 < 0) ? empty : buffer + (
int64_t(width) * y1 + x2) * 4;
445 row4 = (x2 > width - 1 || y2 > height - 1) ? empty : buffer + (
int64_t(width) * y2 + x2) * 4;
/* Non-border path: corners are addressed directly with no range test.
 * NOTE(review): presumably x1/x2/y1/y2 were made valid on lines not shown
 * here -- confirm. */
452 row1 = buffer + (
int64_t(width) * y1 + x1) * 4;
453 row2 = buffer + (
int64_t(width) * y2 + x1) * 4;
454 row3 = buffer + (
int64_t(width) * y1 + x2) * 4;
455 row4 = buffer + (
int64_t(width) * y2 + x2) * 4;
/* Weight naming: a leading "m" marks (1 - value). The definition of a_b is
 * not visible here (presumably a * b -- TODO confirm). */
461 float ma_b = (1.0f - a) *
b;
462 float a_mb = a * (1.0f -
b);
463 float ma_mb = (1.0f - a) * (1.0f -
b);
/* Per channel: weighted sum of the four pixels; adding 0.5f before the
 * conversion to uchar rounds to the nearest integer. */
465 res.x =
uchar(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] + 0.5f);
466 res.y =
uchar(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] + 0.5f);
467 res.z =
uchar(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] + 0.5f);
468 res.w =
uchar(ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3] + 0.5f);
/* Rows of a float lookup table; its declaration and closing brace are outside
 * this excerpt. The visible entries start at 1.0f and fall to 0.0f.
 * NOTE(review): the element count and the formula that produced these values
 * cannot be determined from here -- confirm against the declaration. */
630 1.0f, 0.990965f, 0.982f, 0.973105f, 0.96428f, 0.955524f, 0.946836f,
631 0.938216f, 0.929664f, 0.921178f, 0.912759f, 0.904405f, 0.896117f, 0.887893f,
632 0.879734f, 0.871638f, 0.863605f, 0.855636f, 0.847728f, 0.839883f, 0.832098f,
633 0.824375f, 0.816712f, 0.809108f, 0.801564f, 0.794079f, 0.786653f, 0.779284f,
634 0.771974f, 0.76472f, 0.757523f, 0.750382f, 0.743297f, 0.736267f, 0.729292f,
635 0.722372f, 0.715505f, 0.708693f, 0.701933f, 0.695227f, 0.688572f, 0.68197f,
636 0.67542f, 0.66892f, 0.662471f, 0.656073f, 0.649725f, 0.643426f, 0.637176f,
637 0.630976f, 0.624824f, 0.618719f, 0.612663f, 0.606654f, 0.600691f, 0.594776f,
638 0.588906f, 0.583083f, 0.577305f, 0.571572f, 0.565883f, 0.56024f, 0.55464f,
639 0.549084f, 0.543572f, 0.538102f, 0.532676f, 0.527291f, 0.521949f, 0.516649f,
640 0.511389f, 0.506171f, 0.500994f, 0.495857f, 0.490761f, 0.485704f, 0.480687f,
641 0.475709f, 0.470769f, 0.465869f, 0.461006f, 0.456182f, 0.451395f, 0.446646f,
642 0.441934f, 0.437258f, 0.432619f, 0.428017f, 0.42345f, 0.418919f, 0.414424f,
643 0.409963f, 0.405538f, 0.401147f, 0.39679f, 0.392467f, 0.388178f, 0.383923f,
644 0.379701f, 0.375511f, 0.371355f, 0.367231f, 0.363139f, 0.359079f, 0.355051f,
645 0.351055f, 0.347089f, 0.343155f, 0.339251f, 0.335378f, 0.331535f, 0.327722f,
646 0.323939f, 0.320186f, 0.316461f, 0.312766f, 0.3091f, 0.305462f, 0.301853f,
647 0.298272f, 0.294719f, 0.291194f, 0.287696f, 0.284226f, 0.280782f, 0.277366f,
648 0.273976f, 0.270613f, 0.267276f, 0.263965f, 0.26068f, 0.257421f, 0.254187f,
649 0.250979f, 0.247795f, 0.244636f, 0.241502f, 0.238393f, 0.235308f, 0.232246f,
650 0.229209f, 0.226196f, 0.223206f, 0.220239f, 0.217296f, 0.214375f, 0.211478f,
651 0.208603f, 0.20575f, 0.20292f, 0.200112f, 0.197326f, 0.194562f, 0.191819f,
652 0.189097f, 0.186397f, 0.183718f, 0.18106f, 0.178423f, 0.175806f, 0.17321f,
653 0.170634f, 0.168078f, 0.165542f, 0.163026f, 0.16053f, 0.158053f, 0.155595f,
654 0.153157f, 0.150738f, 0.148337f, 0.145955f, 0.143592f, 0.141248f, 0.138921f,
655 0.136613f, 0.134323f, 0.132051f, 0.129797f, 0.12756f, 0.125341f, 0.123139f,
656 0.120954f, 0.118786f, 0.116635f, 0.114501f, 0.112384f, 0.110283f, 0.108199f,
657 0.106131f, 0.104079f, 0.102043f, 0.100023f, 0.0980186f, 0.09603f, 0.094057f,
658 0.0920994f, 0.0901571f, 0.08823f, 0.0863179f, 0.0844208f, 0.0825384f, 0.0806708f,
659 0.0788178f, 0.0769792f, 0.0751551f, 0.0733451f, 0.0715493f, 0.0697676f, 0.0679997f,
660 0.0662457f, 0.0645054f, 0.0627786f, 0.0610654f, 0.0593655f, 0.0576789f, 0.0560055f,
661 0.0543452f, 0.0526979f, 0.0510634f, 0.0494416f, 0.0478326f, 0.0462361f, 0.0446521f,
662 0.0430805f, 0.0415211f, 0.039974f, 0.0384389f, 0.0369158f, 0.0354046f, 0.0339052f,
663 0.0324175f, 0.0309415f, 0.029477f, 0.0280239f, 0.0265822f, 0.0251517f, 0.0237324f,
664 0.0223242f, 0.020927f, 0.0195408f, 0.0181653f, 0.0168006f, 0.0154466f, 0.0141031f,
665 0.0127701f, 0.0114476f, 0.0101354f, 0.00883339f, 0.00754159f, 0.00625989f, 0.00498819f,
666 0.00372644f, 0.00247454f, 0.00123242f, 0.0f,