Note
Access to this page requires authorization. You can try signing in or changing directories.
Access to this page requires authorization. You can try changing directories.
Microsoft Specific
Emits the Supplemental Streaming SIMD Extensions 3 (SSSE3) instruction pmaddubsw. This instruction multiplies and adds integers.
/*
 * Emits the SSSE3 instruction pmaddubsw.
 *
 * a: sixteen 8-bit unsigned integers (a0 is the least significant byte).
 * b: sixteen 8-bit signed integers (b0 is the least significant byte).
 *
 * Returns eight 16-bit signed integers where
 *   r[i] = SATURATE_16((a[2i] * b[2i]) + (a[2i+1] * b[2i+1]))
 * and SATURATE_16 clamps its argument to the range [-32768, 32767].
 */
__m128i _mm_maddubs_epi16(
__m128i a,
__m128i b
);
Parameters
[in] a
A 128-bit parameter that contains sixteen 8-bit unsigned integers.
[in] b
A 128-bit parameter that contains sixteen 8-bit signed integers.
Return value
A 128-bit result that contains eight 16-bit signed integers, where each result element represents the saturated sum of adjacent SIMD products. This can be expressed with the following equations:
r0 := SATURATE_16((a0 * b0) + (a1 * b1))
r1 := SATURATE_16((a2 * b2) + (a3 * b3))
...
r7 := SATURATE_16((a14 * b14) + (a15 * b15))
Requirements
| Intrinsic | Architecture |
|---|---|
| _mm_maddubs_epi16 | x86, x64 |

Header file <tmmintrin.h>
Remarks
r0-r7 are the sequentially ordered 16-bit components of return value r. r0 indicates the least significant 16 bits.
a0-a15 and b0-b15 are the sequentially ordered 8-bit components of parameters a and b, respectively. a0 and b0 are the least significant 8 bits. Parameter a contains unsigned bytes. Parameter b contains signed bytes.
SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x))
Before you use this intrinsic, software must ensure that the underlying processor supports the instruction.
Example
#include <stdio.h>
#include <tmmintrin.h>
/*
 * Demonstrates _mm_maddubs_epi16: fills two 128-bit operands byte by byte,
 * computes the expected saturated pairwise multiply-add in scalar code, runs
 * the intrinsic on the same operands, and prints expected vs. actual for each
 * of the eight 16-bit result lanes.
 */
int main ()
{
    /* Operand bytes, lowest lane first: a is unsigned, b is signed. */
    static const unsigned char a_bytes[16] = {
        1, 1, 1, 2, 10, 12, 255, 255, 0, 20, 10, 11, 12, 13, 14, 15
    };
    static const signed char b_bytes[16] = {
        32, -32, 2, 4, -128, 12, -128, -128, 100, 20, 10, 11, 12, 13, 14, 15
    };

    __m128i a, b, final;
    int idx, lane;

    /* Load every byte lane of both operands. */
    for (idx = 0; idx < 16; ++idx) {
        a.m128i_u8[idx] = a_bytes[idx];
        b.m128i_i8[idx] = b_bytes[idx];
    }

    /* Scalar reference: each 16-bit lane is the clamped sum of two adjacent byte products. */
    for (lane = 0; lane < 8; ++lane) {
        int lo = 2 * lane;
        int hi = lo + 1;
        int sum = (a.m128i_u8[lo] * b.m128i_i8[lo]) + (a.m128i_u8[hi] * b.m128i_i8[hi]);

        if (sum > 32767) {
            sum = 32767;
        } else if (sum < -32768) {
            sum = -32768;
        }
        final.m128i_i16[lane] = sum;
    }

    /* Result produced by the intrinsic itself. */
    __m128i res = _mm_maddubs_epi16(a, b);

    printf_s("Res0 should be %d: %d\nRes1 should be %d: %d\n",
        final.m128i_i16[0], res.m128i_i16[0], final.m128i_i16[1], res.m128i_i16[1]);
    printf_s("Res2 should be %d: %d\nRes3 should be %d: %d\n",
        final.m128i_i16[2], res.m128i_i16[2], final.m128i_i16[3], res.m128i_i16[3]);
    printf_s("Res4 should be %d: %d\nRes5 should be %d: %d\n",
        final.m128i_i16[4], res.m128i_i16[4], final.m128i_i16[5], res.m128i_i16[5]);
    printf_s("Res6 should be %d: %d\nRes7 should be %d: %d\n",
        final.m128i_i16[6], res.m128i_i16[6], final.m128i_i16[7], res.m128i_i16[7]);

    return 0;
}
Res0 should be 0: 0
Res1 should be 10: 10
Res2 should be -1136: -1136
Res3 should be -32768: -32768
Res4 should be 400: 400
Res5 should be 221: 221
Res6 should be 313: 313
Res7 should be 421: 421