298 BLI_assert(components > 0 && components <= 4);
301 float a_b, ma_b, a_mb, ma_mb;
319 const float *row1, *row2, *row3, *row4;
320 const float empty[4] = {0.0f, 0.0f, 0.0f, 0.0f};
348 row1 = buffer + (
int64_t(width) * y1c + x1c) * components;
349 row2 = buffer + (
int64_t(width) * y2c + x1c) * components;
350 row3 = buffer + (
int64_t(width) * y1c + x2c) * components;
351 row4 = buffer + (
int64_t(width) * y2c + x2c) * components;
358 if (x2 > width - 1) {
368 if (y2 > height - 1) {
378 ma_b = (1.0f - a) *
b;
379 a_mb = a * (1.0f -
b);
380 ma_mb = (1.0f - a) * (1.0f -
b);
382 if (components == 1) {
383 output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
385 else if (components == 2) {
386 output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
387 output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
389 else if (components == 3) {
390 output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
391 output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
392 output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
396 __m128 rgba1 = _mm_loadu_ps(row1);
397 __m128 rgba2 = _mm_loadu_ps(row2);
398 __m128 rgba3 = _mm_loadu_ps(row3);
399 __m128 rgba4 = _mm_loadu_ps(row4);
400 rgba1 = _mm_mul_ps(_mm_set1_ps(ma_mb), rgba1);
401 rgba2 = _mm_mul_ps(_mm_set1_ps(ma_b), rgba2);
402 rgba3 = _mm_mul_ps(_mm_set1_ps(a_mb), rgba3);
403 rgba4 = _mm_mul_ps(_mm_set1_ps(a_b), rgba4);
404 __m128 rgba13 = _mm_add_ps(rgba1, rgba3);
405 __m128 rgba24 = _mm_add_ps(rgba2, rgba4);
406 __m128 rgba = _mm_add_ps(rgba13, rgba24);
407 _mm_storeu_ps(
output, rgba);
409 output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
410 output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
411 output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
412 output[3] = ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3];
424 __m128 uvuv = _mm_set_ps(
v, u,
v, u);
425 __m128 uvuv_floor = _mm_floor_ps(uvuv);
428 __m128i xy12 = _mm_add_epi32(_mm_cvttps_epi32(uvuv_floor), _mm_set_epi32(1, 1, 0, 0));
430 __m128i size_minus_1 = _mm_sub_epi32(_mm_set_epi32(height, width, height, width),
434 __m128i x1234, y1234, invalid_1234;
436 if constexpr (border) {
440 __m128i too_lo_xy12 = _mm_cmplt_epi32(xy12, _mm_setzero_si128());
441 __m128i too_hi_xy12 = _mm_cmplt_epi32(size_minus_1, xy12);
442 __m128i invalid_xy12 = _mm_or_si128(too_lo_xy12, too_hi_xy12);
445 x1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(2, 2, 0, 0));
446 y1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(3, 1, 3, 1));
447 invalid_1234 = _mm_or_si128(_mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(2, 2, 0, 0)),
448 _mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(3, 1, 3, 1)));
450 x1234 = _mm_andnot_si128(invalid_1234, x1234);
451 y1234 = _mm_andnot_si128(invalid_1234, y1234);
455 __m128i xy12_clamped = _mm_max_epi32(xy12, _mm_setzero_si128());
456 xy12_clamped = _mm_min_epi32(xy12_clamped, size_minus_1);
457 x1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(2, 2, 0, 0));
458 y1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(3, 1, 3, 1));
465 _mm_storeu_ps((
float *)xcoord, _mm_castsi128_ps(x1234));
466 _mm_storeu_ps((
float *)ycoord, _mm_castsi128_ps(y1234));
467 int sample1 = ((
const int *)buffer)[ycoord[0] *
int64_t(width) + xcoord[0]];
468 int sample2 = ((
const int *)buffer)[ycoord[1] *
int64_t(width) + xcoord[1]];
469 int sample3 = ((
const int *)buffer)[ycoord[2] *
int64_t(width) + xcoord[2]];
470 int sample4 = ((
const int *)buffer)[ycoord[3] *
int64_t(width) + xcoord[3]];
471 __m128i samples1234 = _mm_set_epi32(sample4, sample3, sample2, sample1);
472 if constexpr (border) {
474 samples1234 = _mm_andnot_si128(invalid_1234, samples1234);
479 __m128i rgba16_12 = _mm_unpacklo_epi8(samples1234, _mm_setzero_si128());
480 __m128i rgba16_34 = _mm_unpackhi_epi8(samples1234, _mm_setzero_si128());
482 __m128 rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_12, _mm_setzero_si128()));
483 __m128 rgba2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_12, _mm_setzero_si128()));
484 __m128 rgba3 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_34, _mm_setzero_si128()));
485 __m128 rgba4 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_34, _mm_setzero_si128()));
488 __m128 abab = _mm_sub_ps(uvuv, uvuv_floor);
489 __m128 m_abab = _mm_sub_ps(_mm_set1_ps(1.0f), abab);
490 __m128 ab_mab = _mm_shuffle_ps(abab, m_abab, _MM_SHUFFLE(3, 2, 1, 0));
491 __m128 factors = _mm_mul_ps(_mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(0, 0, 2, 2)),
492 _mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(1, 3, 1, 3)));
495 rgba1 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(0, 0, 0, 0)), rgba1);
496 rgba2 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(1, 1, 1, 1)), rgba2);
497 rgba3 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(2, 2, 2, 2)), rgba3);
498 rgba4 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(3, 3, 3, 3)), rgba4);
499 __m128 rgba13 = _mm_add_ps(rgba1, rgba3);
500 __m128 rgba24 = _mm_add_ps(rgba2, rgba4);
501 __m128 rgba = _mm_add_ps(rgba13, rgba24);
502 rgba = _mm_add_ps(rgba, _mm_set1_ps(0.5f));
505 __m128i rgba32 = _mm_cvttps_epi32(rgba);
506 __m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
507 __m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
508 _mm_store_ss((
float *)&res, _mm_castsi128_ps(rgba8));
521 if (border && (x2 < 0 || x1 >= width || y2 < 0 || y1 >= height)) {
526 const uchar *row1, *row2, *row3, *row4;
527 uchar empty[4] = {0, 0, 0, 0};
528 if constexpr (border) {
529 row1 = (x1 < 0 || y1 < 0) ? empty : buffer + (
int64_t(width) * y1 + x1) * 4;
530 row2 = (x1 < 0 || y2 > height - 1) ? empty : buffer + (
int64_t(width) * y2 + x1) * 4;
531 row3 = (x2 > width - 1 || y1 < 0) ? empty : buffer + (
int64_t(width) * y1 + x2) * 4;
532 row4 = (x2 > width - 1 || y2 > height - 1) ? empty : buffer + (
int64_t(width) * y2 + x2) * 4;
539 row1 = buffer + (
int64_t(width) * y1 + x1) * 4;
540 row2 = buffer + (
int64_t(width) * y2 + x1) * 4;
541 row3 = buffer + (
int64_t(width) * y1 + x2) * 4;
542 row4 = buffer + (
int64_t(width) * y2 + x2) * 4;
548 float ma_b = (1.0f - a) *
b;
549 float a_mb = a * (1.0f -
b);
550 float ma_mb = (1.0f - a) * (1.0f -
b);
552 res.x =
uchar(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] + 0.5f);
553 res.y =
uchar(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] + 0.5f);
554 res.z =
uchar(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] + 0.5f);
555 res.w =
uchar(ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3] + 0.5f);
768 1.0f, 0.990965f, 0.982f, 0.973105f, 0.96428f, 0.955524f, 0.946836f,
769 0.938216f, 0.929664f, 0.921178f, 0.912759f, 0.904405f, 0.896117f, 0.887893f,
770 0.879734f, 0.871638f, 0.863605f, 0.855636f, 0.847728f, 0.839883f, 0.832098f,
771 0.824375f, 0.816712f, 0.809108f, 0.801564f, 0.794079f, 0.786653f, 0.779284f,
772 0.771974f, 0.76472f, 0.757523f, 0.750382f, 0.743297f, 0.736267f, 0.729292f,
773 0.722372f, 0.715505f, 0.708693f, 0.701933f, 0.695227f, 0.688572f, 0.68197f,
774 0.67542f, 0.66892f, 0.662471f, 0.656073f, 0.649725f, 0.643426f, 0.637176f,
775 0.630976f, 0.624824f, 0.618719f, 0.612663f, 0.606654f, 0.600691f, 0.594776f,
776 0.588906f, 0.583083f, 0.577305f, 0.571572f, 0.565883f, 0.56024f, 0.55464f,
777 0.549084f, 0.543572f, 0.538102f, 0.532676f, 0.527291f, 0.521949f, 0.516649f,
778 0.511389f, 0.506171f, 0.500994f, 0.495857f, 0.490761f, 0.485704f, 0.480687f,
779 0.475709f, 0.470769f, 0.465869f, 0.461006f, 0.456182f, 0.451395f, 0.446646f,
780 0.441934f, 0.437258f, 0.432619f, 0.428017f, 0.42345f, 0.418919f, 0.414424f,
781 0.409963f, 0.405538f, 0.401147f, 0.39679f, 0.392467f, 0.388178f, 0.383923f,
782 0.379701f, 0.375511f, 0.371355f, 0.367231f, 0.363139f, 0.359079f, 0.355051f,
783 0.351055f, 0.347089f, 0.343155f, 0.339251f, 0.335378f, 0.331535f, 0.327722f,
784 0.323939f, 0.320186f, 0.316461f, 0.312766f, 0.3091f, 0.305462f, 0.301853f,
785 0.298272f, 0.294719f, 0.291194f, 0.287696f, 0.284226f, 0.280782f, 0.277366f,
786 0.273976f, 0.270613f, 0.267276f, 0.263965f, 0.26068f, 0.257421f, 0.254187f,
787 0.250979f, 0.247795f, 0.244636f, 0.241502f, 0.238393f, 0.235308f, 0.232246f,
788 0.229209f, 0.226196f, 0.223206f, 0.220239f, 0.217296f, 0.214375f, 0.211478f,
789 0.208603f, 0.20575f, 0.20292f, 0.200112f, 0.197326f, 0.194562f, 0.191819f,
790 0.189097f, 0.186397f, 0.183718f, 0.18106f, 0.178423f, 0.175806f, 0.17321f,
791 0.170634f, 0.168078f, 0.165542f, 0.163026f, 0.16053f, 0.158053f, 0.155595f,
792 0.153157f, 0.150738f, 0.148337f, 0.145955f, 0.143592f, 0.141248f, 0.138921f,
793 0.136613f, 0.134323f, 0.132051f, 0.129797f, 0.12756f, 0.125341f, 0.123139f,
794 0.120954f, 0.118786f, 0.116635f, 0.114501f, 0.112384f, 0.110283f, 0.108199f,
795 0.106131f, 0.104079f, 0.102043f, 0.100023f, 0.0980186f, 0.09603f, 0.094057f,
796 0.0920994f, 0.0901571f, 0.08823f, 0.0863179f, 0.0844208f, 0.0825384f, 0.0806708f,
797 0.0788178f, 0.0769792f, 0.0751551f, 0.0733451f, 0.0715493f, 0.0697676f, 0.0679997f,
798 0.0662457f, 0.0645054f, 0.0627786f, 0.0610654f, 0.0593655f, 0.0576789f, 0.0560055f,
799 0.0543452f, 0.0526979f, 0.0510634f, 0.0494416f, 0.0478326f, 0.0462361f, 0.0446521f,
800 0.0430805f, 0.0415211f, 0.039974f, 0.0384389f, 0.0369158f, 0.0354046f, 0.0339052f,
801 0.0324175f, 0.0309415f, 0.029477f, 0.0280239f, 0.0265822f, 0.0251517f, 0.0237324f,
802 0.0223242f, 0.020927f, 0.0195408f, 0.0181653f, 0.0168006f, 0.0154466f, 0.0141031f,
803 0.0127701f, 0.0114476f, 0.0101354f, 0.00883339f, 0.00754159f, 0.00625989f, 0.00498819f,
804 0.00372644f, 0.00247454f, 0.00123242f, 0.0f,