Intrinsics Guide

Technologies
MMX
SSE
SSE2
SSE3
SSSE3
SSE4.1
SSE4.2
AVX
AVX2
FMA
AVX-512
KNC
SVML
Other

Categories
Application-Targeted
Arithmetic
Bit Manipulation
Cast
Compare
Convert
Cryptography
Elementary Math Functions
General Support
Load
Logical
Mask
Miscellaneous
Move
OS-Targeted
Probability/Statistics
Random
Set
Shift
Special Math Functions
Store
String Compare
Swizzle
Trigonometry

Legal Statement
The Intel Intrinsics Guide is an interactive reference tool for Intel intrinsic instructions, which are C-style functions that provide access to many Intel instructions - including Intel® SSE, AVX, AVX-512, and more - without the need to write assembly code.

vp4dpwssd
__m512i _mm512_4dpwssd_epi32 (__m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)

Synopsis

__m512i _mm512_4dpwssd_epi32 (__m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
#include <immintrin.h>
Instruction: vp4dpwssd zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW

Description

Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation, and store the results in dst.

Operation

FOR j := 0 to 15
    FOR m := 0 to 3
        lim_base := m*32
        i := j*32
        tl := b[lim_base+15:lim_base]
        tu := b[lim_base+31:lim_base+16]
        lword := a{m}[i+15:i] * tl
        uword := a{m}[i+31:i+16] * tu
        dst[i+31:i] := src[i+31:i] + lword + uword
    ENDFOR
ENDFOR
dst[MAX:512] := 0
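The pseudocode above writes src plus one word-pair product per iteration of m; the instruction's intent is that each 32-bit lane accumulates the products from all four source operands. A scalar sketch of that intended semantics, using a hypothetical helper name (not part of immintrin.h):

```c
#include <assert.h>
#include <stdint.h>

/* Scalar model of the unmasked vp4dpwssd step: each 32-bit lane j
 * accumulates, over the four operands a0..a3, the dot product of the
 * lane's two signed words with the matching word pair from b. */
static void dpwssd4_ref(int32_t dst[16], const int32_t src[16],
                        int16_t a[4][32], const int16_t b[8])
{
    for (int j = 0; j < 16; ++j) {
        int32_t acc = src[j];
        for (int m = 0; m < 4; ++m) {
            int32_t lword = (int32_t)a[m][2*j]     * b[2*m];
            int32_t uword = (int32_t)a[m][2*j + 1] * b[2*m + 1];
            acc += lword + uword;
        }
        dst[j] = acc;
    }
}
```

The memory operand b supplies only four word pairs (one per source operand), which is why it is an m128 rather than a full vector.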
vp4dpwssd
__m512i _mm512_mask_4dpwssd_epi32 (__m512i src, __mmask16 k, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)

Synopsis

__m512i _mm512_mask_4dpwssd_epi32 (__m512i src, __mmask16 k, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
#include <immintrin.h>
Instruction: vp4dpwssd zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW

Description

Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        FOR m := 0 to 3
            lim_base := m*32
            tl := b[lim_base+15:lim_base]
            tu := b[lim_base+31:lim_base+16]
            lword := a{m}[i+15:i] * tl
            uword := a{m}[i+31:i+16] * tu
            dst[i+31:i] := src[i+31:i] + lword + uword
        ENDFOR
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
vp4dpwssd
__m512i _mm512_maskz_4dpwssd_epi32 (__mmask16 k, __m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)

Synopsis

__m512i _mm512_maskz_4dpwssd_epi32 (__mmask16 k, __m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
#include <immintrin.h>
Instruction: vp4dpwssd zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW

Description

Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        FOR m := 0 to 3
            lim_base := m*32
            tl := b[lim_base+15:lim_base]
            tu := b[lim_base+31:lim_base+16]
            lword := a{m}[i+15:i] * tl
            uword := a{m}[i+31:i+16] * tu
            dst[i+31:i] := src[i+31:i] + lword + uword
        ENDFOR
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
vp4dpwssds
__m512i _mm512_4dpwssds_epi32 (__m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)

Synopsis

__m512i _mm512_4dpwssds_epi32 (__m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
#include <immintrin.h>
Instruction: vp4dpwssds zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW

Description

Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation and signed saturation, and store the results in dst.

Operation

FOR j := 0 to 15
    FOR m := 0 to 3
        lim_base := m*32
        i := j*32
        tl := b[lim_base+15:lim_base]
        tu := b[lim_base+31:lim_base+16]
        lword := a{m}[i+15:i] * tl
        uword := a{m}[i+31:i+16] * tu
        dst[i+31:i] := SIGNED_DWORD_SATURATE(src[i+31:i] + lword + uword)
    ENDFOR
ENDFOR
dst[MAX:512] := 0
vp4dpwssds
__m512i _mm512_mask_4dpwssds_epi32 (__m512i src, __mmask16 k, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)

Synopsis

__m512i _mm512_mask_4dpwssds_epi32 (__m512i src, __mmask16 k, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
#include <immintrin.h>
Instruction: vp4dpwssds zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW

Description

Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation and signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        FOR m := 0 to 3
            lim_base := m*32
            tl := b[lim_base+15:lim_base]
            tu := b[lim_base+31:lim_base+16]
            lword := a{m}[i+15:i] * tl
            uword := a{m}[i+31:i+16] * tu
            dst[i+31:i] := SIGNED_DWORD_SATURATE(src[i+31:i] + lword + uword)
        ENDFOR
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
vp4dpwssds
__m512i _mm512_maskz_4dpwssds_epi32 (__m512i src, __mmask16 k, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)

Synopsis

__m512i _mm512_maskz_4dpwssds_epi32 (__m512i src, __mmask16 k, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
#include <immintrin.h>
Instruction: vp4dpwssds zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW

Description

Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation and signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        FOR m := 0 to 3
            lim_base := m*32
            tl := b[lim_base+15:lim_base]
            tu := b[lim_base+31:lim_base+16]
            lword := a{m}[i+15:i] * tl
            uword := a{m}[i+31:i+16] * tu
            dst[i+31:i] := SIGNED_DWORD_SATURATE(src[i+31:i] + lword + uword)
        ENDFOR
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
v4fmaddps
__m512 _mm512_4fmadd_ps (__m512 a, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)

Synopsis

__m512 _mm512_4fmadd_ps (__m512 a, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS

Description

Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the 4 corresponding packed elements in c, accumulating with the corresponding elements in a. Store the results in dst.

Operation

dst := a
FOR m := 0 to 3
    FOR j := 0 to 15
        i := j*32
        n := m*32
        dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] + b{m}[i+31:i] * c[n+31:n])
    ENDFOR
ENDFOR
dst[MAX:512] := 0
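The loops above pair each full-width operand b0..b3 with one scalar from the m128 memory operand c. A scalar sketch of the same computation, with a hypothetical helper name (real code would call _mm512_4fmadd_ps on AVX512_4FMAPS hardware):

```c
#include <assert.h>

/* Scalar model of the unmasked v4fmaddps step: each float lane j of the
 * accumulator gains b{m}[j] * c[m] for each of the four source operands;
 * c[m] acts as a broadcast scalar across all 16 lanes. */
static void fmadd4_ps_ref(float dst[16], const float a[16],
                          float b[4][16], const float c[4])
{
    for (int j = 0; j < 16; ++j) {
        float acc = a[j];
        for (int m = 0; m < 4; ++m)
            acc += b[m][j] * c[m];
        dst[j] = acc;
    }
}
```

Note the scalar model applies the multiplies in one fixed order; the hardware chains the four FMAs with a single rounding per step under MXCSR control.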
v4fmaddps
__m512 _mm512_mask_4fmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)

Synopsis

__m512 _mm512_mask_4fmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS

Description

Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the 4 corresponding packed elements in c, accumulating with the corresponding elements in a. Store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

dst := a
FOR m := 0 to 3
    FOR j := 0 to 15
        i := j*32
        n := m*32
        IF k[j]
            dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] + b{m}[i+31:i] * c[n+31:n])
        FI
    ENDFOR
ENDFOR
dst[MAX:512] := 0
v4fmaddps
__m512 _mm512_maskz_4fmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)

Synopsis

__m512 _mm512_maskz_4fmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS

Description

Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the 4 corresponding packed elements in c, accumulating with the corresponding elements in a. Store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

dst := a
FOR m := 0 to 3
    FOR j := 0 to 15
        i := j*32
        n := m*32
        IF k[j]
            dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] + b{m}[i+31:i] * c[n+31:n])
        ELSE
            dst[i+31:i] := 0
        FI
    ENDFOR
ENDFOR
dst[MAX:512] := 0
v4fmaddss
__m128 _mm_4fmadd_ss (__m128 a, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)

Synopsis

__m128 _mm_4fmadd_ss (__m128 a, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS

Description

Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the corresponding element in c, accumulating with the lower element in a. Store the result in the lower element of dst.

Operation

dst := a
FOR j := 0 to 3
    i := j*32
    dst[31:0] := RoundFPControl_MXCSR(dst[31:0] + b{j}[31:0] * c[i+31:i])
ENDFOR
dst[MAX:32] := 0
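Only the lowest float participates here; the four multiply-adds are chained through one accumulator in order. A scalar sketch with a hypothetical helper name:

```c
#include <assert.h>

/* Scalar model of v4fmaddss: the low element of a is the accumulator,
 * and the low elements of b0..b3 are multiplied by the four floats in
 * the memory operand c, one after another. */
static float fmadd4_ss_ref(float a, const float b[4], const float c[4])
{
    float acc = a;
    for (int j = 0; j < 4; ++j)
        acc += b[j] * c[j];
    return acc;
}
```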
v4fmaddss
__m128 _mm_mask_4fmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)

Synopsis

__m128 _mm_mask_4fmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS

Description

Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the corresponding element in c, accumulating with the lower element in a. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set).

Operation

dst := a
IF k[0]
    FOR j := 0 to 3
        i := j*32
        dst[31:0] := RoundFPControl_MXCSR(dst[31:0] + b{j}[31:0] * c[i+31:i])
    ENDFOR
FI
dst[MAX:32] := 0
v4fmaddss
__m128 _mm_maskz_4fmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)

Synopsis

__m128 _mm_maskz_4fmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS

Description

Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the corresponding element in c, accumulating with the lower element in a. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set).

Operation

dst := a
IF k[0]
    FOR j := 0 to 3
        i := j*32
        dst[31:0] := RoundFPControl_MXCSR(dst[31:0] + b{j}[31:0] * c[i+31:i])
    ENDFOR
ELSE
    dst[31:0] := 0
FI
dst[MAX:32] := 0
v4fnmaddps
__m512 _mm512_4fnmadd_ps (__m512 a, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)

Synopsis

__m512 _mm512_4fnmadd_ps (__m512 a, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fnmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS

Description

Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the 4 corresponding packed elements in c, accumulating the negated intermediate result with the corresponding elements in a. Store the results in dst.

Operation

dst := a
FOR m := 0 to 3
    FOR j := 0 to 15
        i := j*32
        n := m*32
        dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] - b{m}[i+31:i] * c[n+31:n])
    ENDFOR
ENDFOR
dst[MAX:512] := 0
v4fnmaddps
__m512 _mm512_mask_4fnmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)

Synopsis

__m512 _mm512_mask_4fnmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fnmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS

Description

Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the 4 corresponding packed elements in c, accumulating the negated intermediate result with the corresponding elements in a. Store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).

Operation

dst := a
FOR m := 0 to 3
    FOR j := 0 to 15
        i := j*32
        n := m*32
        IF k[j]
            dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] - b{m}[i+31:i] * c[n+31:n])
        FI
    ENDFOR
ENDFOR
dst[MAX:512] := 0
v4fnmaddps
__m512 _mm512_maskz_4fnmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)

Synopsis

__m512 _mm512_maskz_4fnmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fnmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS

Description

Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the 4 corresponding packed elements in c, accumulating the negated intermediate result with the corresponding elements in a. Store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

dst := a
FOR m := 0 to 3
    FOR j := 0 to 15
        i := j*32
        n := m*32
        IF k[j]
            dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] - b{m}[i+31:i] * c[n+31:n])
        ELSE
            dst[i+31:i] := 0
        FI
    ENDFOR
ENDFOR
dst[MAX:512] := 0
v4fnmaddss
__m128 _mm_4fnmadd_ss (__m128 a, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)

Synopsis

__m128 _mm_4fnmadd_ss (__m128 a, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fnmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS

Description

Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the corresponding element in c, accumulating the negated intermediate result with the lower element in a. Store the result in the lower element of dst.

Operation

dst := a
FOR j := 0 to 3
    i := j*32
    dst[31:0] := RoundFPControl_MXCSR(dst[31:0] - b{j}[31:0] * c[i+31:i])
ENDFOR
dst[MAX:32] := 0
v4fnmaddss
__m128 _mm_mask_4fnmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)

Synopsis

__m128 _mm_mask_4fnmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fnmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS

Description

Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the corresponding element in c, accumulating the negated intermediate result with the lower element in a. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set).

Operation

dst := a
IF k[0]
    FOR j := 0 to 3
        i := j*32
        dst[31:0] := RoundFPControl_MXCSR(dst[31:0] - b{j}[31:0] * c[i+31:i])
    ENDFOR
FI
dst[MAX:32] := 0
v4fnmaddss
__m128 _mm_maskz_4fnmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)

Synopsis

__m128 _mm_maskz_4fnmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fnmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS

Description

Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the corresponding element in c, accumulating the negated intermediate result with the lower element in a. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set).

Operation

dst := a
IF k[0]
    FOR j := 0 to 3
        i := j*32
        dst[31:0] := RoundFPControl_MXCSR(dst[31:0] - b{j}[31:0] * c[i+31:i])
    ENDFOR
ELSE
    dst[31:0] := 0
FI
dst[MAX:32] := 0
pabsw
__m128i _mm_abs_epi16 (__m128i a)

Synopsis

__m128i _mm_abs_epi16 (__m128i a)
#include <tmmintrin.h>
Instruction: pabsw xmm, xmm
CPUID Flags: SSSE3

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 7
    i := j*16
    dst[i+15:i] := ABS(a[i+15:i])
ENDFOR

Performance

Architecture   Latency   Throughput (CPI)
Skylake        1         0.5
Broadwell      1         0.5
Haswell        1         0.5
Ivy Bridge     1         0.5
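One per-lane detail worth noting for all of the pabs/vpabs family: ABS is modular, so the most negative value in a lane has no positive counterpart and maps to itself. A one-lane scalar model of pabsw (hypothetical helper name):

```c
#include <assert.h>
#include <stdint.h>

/* One 16-bit lane of pabsw: the result is the unsigned magnitude, and
 * ABS(INT16_MIN) wraps back to 0x8000 because +32768 does not fit in
 * a 16-bit lane. */
static uint16_t abs_epi16_lane(int16_t x)
{
    uint16_t ux = (uint16_t)x;
    return (x < 0) ? (uint16_t)-(int32_t)ux : ux;
}
```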
vpabsw
__m128i _mm_mask_abs_epi16 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_abs_epi16 (__m128i src, __mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*16
    IF k[j]
        dst[i+15:i] := ABS(a[i+15:i])
    ELSE
        dst[i+15:i] := src[i+15:i]
    FI
ENDFOR
dst[MAX:128] := 0
vpabsw
__m128i _mm_maskz_abs_epi16 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_abs_epi16 (__mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*16
    IF k[j]
        dst[i+15:i] := ABS(a[i+15:i])
    ELSE
        dst[i+15:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
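The _mask_ and _maskz_ forms differ only in what lands in lanes whose mask bit is clear: the writemask form keeps the lane from src, the zeromask form writes zero. A per-lane sketch with hypothetical helper names:

```c
#include <assert.h>
#include <stdint.h>

/* Writemask lane: keep src where the k bit is clear. */
static int16_t mask_abs16_lane(int16_t src, int k_bit, int16_t a)
{
    return k_bit ? (int16_t)(a < 0 ? -a : a) : src;
}

/* Zeromask lane: write zero where the k bit is clear. */
static int16_t maskz_abs16_lane(int k_bit, int16_t a)
{
    return k_bit ? (int16_t)(a < 0 ? -a : a) : (int16_t)0;
}
```

The same pattern applies to every masked intrinsic in this section, at every element width.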
vpabsw
__m256i _mm256_abs_epi16 (__m256i a)

Synopsis

__m256i _mm256_abs_epi16 (__m256i a)
#include <immintrin.h>
Instruction: vpabsw ymm, ymm
CPUID Flags: AVX2

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 15
    i := j*16
    dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
dst[MAX:256] := 0
vpabsw
__m256i _mm256_mask_abs_epi16 (__m256i src, __mmask16 k, __m256i a)

Synopsis

__m256i _mm256_mask_abs_epi16 (__m256i src, __mmask16 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*16
    IF k[j]
        dst[i+15:i] := ABS(a[i+15:i])
    ELSE
        dst[i+15:i] := src[i+15:i]
    FI
ENDFOR
dst[MAX:256] := 0
vpabsw
__m256i _mm256_maskz_abs_epi16 (__mmask16 k, __m256i a)

Synopsis

__m256i _mm256_maskz_abs_epi16 (__mmask16 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*16
    IF k[j]
        dst[i+15:i] := ABS(a[i+15:i])
    ELSE
        dst[i+15:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vpabsw
__m512i _mm512_abs_epi16 (__m512i a)

Synopsis

__m512i _mm512_abs_epi16 (__m512i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 31
    i := j*16
    dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
dst[MAX:512] := 0
vpabsw
__m512i _mm512_mask_abs_epi16 (__m512i src, __mmask32 k, __m512i a)

Synopsis

__m512i _mm512_mask_abs_epi16 (__m512i src, __mmask32 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31
    i := j*16
    IF k[j]
        dst[i+15:i] := ABS(a[i+15:i])
    ELSE
        dst[i+15:i] := src[i+15:i]
    FI
ENDFOR
dst[MAX:512] := 0
vpabsw
__m512i _mm512_maskz_abs_epi16 (__mmask32 k, __m512i a)

Synopsis

__m512i _mm512_maskz_abs_epi16 (__mmask32 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512BW

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31
    i := j*16
    IF k[j]
        dst[i+15:i] := ABS(a[i+15:i])
    ELSE
        dst[i+15:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
pabsd
__m128i _mm_abs_epi32 (__m128i a)

Synopsis

__m128i _mm_abs_epi32 (__m128i a)
#include <tmmintrin.h>
Instruction: pabsd xmm, xmm
CPUID Flags: SSSE3

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 3
    i := j*32
    dst[i+31:i] := ABS(a[i+31:i])
ENDFOR

Performance

Architecture   Latency   Throughput (CPI)
Skylake        1         0.5
Broadwell      1         0.5
Haswell        1         0.5
Ivy Bridge     1         0.5
vpabsd
__m128i _mm_mask_abs_epi32 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_abs_epi32 (__m128i src, __mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := ABS(a[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
vpabsd
__m128i _mm_maskz_abs_epi32 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_abs_epi32 (__mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := ABS(a[i+31:i])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vpabsd
__m256i _mm256_abs_epi32 (__m256i a)

Synopsis

__m256i _mm256_abs_epi32 (__m256i a)
#include <immintrin.h>
Instruction: vpabsd ymm, ymm
CPUID Flags: AVX2

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 7
    i := j*32
    dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
vpabsd
__m256i _mm256_mask_abs_epi32 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_abs_epi32 (__m256i src, __mmask8 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := ABS(a[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
vpabsd
__m256i _mm256_maskz_abs_epi32 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_abs_epi32 (__mmask8 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := ABS(a[i+31:i])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vpabsd
__m512i _mm512_abs_epi32 (__m512i a)

Synopsis

__m512i _mm512_abs_epi32 (__m512i a)
#include <immintrin.h>
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpabsd
__m512i _mm512_mask_abs_epi32 (__m512i src, __mmask16 k, __m512i a)

Synopsis

__m512i _mm512_mask_abs_epi32 (__m512i src, __mmask16 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := ABS(a[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
vpabsd
__m512i _mm512_maskz_abs_epi32 (__mmask16 k, __m512i a)

Synopsis

__m512i _mm512_maskz_abs_epi32 (__mmask16 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := ABS(a[i+31:i])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
vpabsq
__m128i _mm_abs_epi64 (__m128i a)

Synopsis

__m128i _mm_abs_epi64 (__m128i a)
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 1
    i := j*64
    dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
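As with the narrower widths, the 64-bit ABS is modular: INT64_MIN has no positive counterpart in a signed lane and comes back unchanged. A one-lane scalar model of vpabsq (hypothetical helper name):

```c
#include <assert.h>
#include <stdint.h>

/* One 64-bit lane of vpabsq; ABS(0x8000000000000000) stays
 * 0x8000000000000000 because the negation wraps modulo 2^64. */
static uint64_t abs_epi64_lane(int64_t x)
{
    return (x < 0) ? -(uint64_t)x : (uint64_t)x;
}
```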
vpabsq
__m128i _mm_mask_abs_epi64 (__m128i src, __mmask8 k, __m128i a)

Synopsis

__m128i _mm_mask_abs_epi64 (__m128i src, __mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := ABS(a[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
vpabsq
__m128i _mm_maskz_abs_epi64 (__mmask8 k, __m128i a)

Synopsis

__m128i _mm_maskz_abs_epi64 (__mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := ABS(a[i+63:i])
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vpabsq
__m256i _mm256_abs_epi64 (__m256i a)

Synopsis

__m256i _mm256_abs_epi64 (__m256i a)
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 3
    i := j*64
    dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vpabsq
__m256i _mm256_mask_abs_epi64 (__m256i src, __mmask8 k, __m256i a)

Synopsis

__m256i _mm256_mask_abs_epi64 (__m256i src, __mmask8 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := ABS(a[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
vpabsq
__m256i _mm256_maskz_abs_epi64 (__mmask8 k, __m256i a)

Synopsis

__m256i _mm256_maskz_abs_epi64 (__mmask8 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := ABS(a[i+63:i])
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vpabsq
__m512i _mm512_abs_epi64 (__m512i a)

Synopsis

__m512i _mm512_abs_epi64 (__m512i a)
#include <immintrin.h>
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vpabsq
__m512i _mm512_mask_abs_epi64 (__m512i src, __mmask8 k, __m512i a)

Synopsis

__m512i _mm512_mask_abs_epi64 (__m512i src, __mmask8 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := ABS(a[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
vpabsq
__m512i _mm512_maskz_abs_epi64 (__mmask8 k, __m512i a)

Synopsis

__m512i _mm512_maskz_abs_epi64 (__mmask8 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F

Description

Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := ABS(a[i+63:i])
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
pabsb
__m128i _mm_abs_epi8 (__m128i a)

Synopsis

__m128i _mm_abs_epi8 (__m128i a)
#include <tmmintrin.h>
Instruction: pabsb xmm, xmm
CPUID Flags: SSSE3

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 15
    i := j*8
    dst[i+7:i] := ABS(a[i+7:i])
ENDFOR

Performance

Architecture   Latency   Throughput (CPI)
Skylake        1         0.5
Broadwell      1         0.5
Haswell        1         0.5
Ivy Bridge     1         0.5
vpabsb
__m128i _mm_mask_abs_epi8 (__m128i src, __mmask16 k, __m128i a)

Synopsis

__m128i _mm_mask_abs_epi8 (__m128i src, __mmask16 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*8
    IF k[j]
        dst[i+7:i] := ABS(a[i+7:i])
    ELSE
        dst[i+7:i] := src[i+7:i]
    FI
ENDFOR
dst[MAX:128] := 0
vpabsb
__m128i _mm_maskz_abs_epi8 (__mmask16 k, __m128i a)

Synopsis

__m128i _mm_maskz_abs_epi8 (__mmask16 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*8
    IF k[j]
        dst[i+7:i] := ABS(a[i+7:i])
    ELSE
        dst[i+7:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vpabsb
__m256i _mm256_abs_epi8 (__m256i a)

Synopsis

__m256i _mm256_abs_epi8 (__m256i a)
#include <immintrin.h>
Instruction: vpabsb ymm, ymm
CPUID Flags: AVX2

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 31
    i := j*8
    dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
dst[MAX:256] := 0
vpabsb
__m256i _mm256_mask_abs_epi8 (__m256i src, __mmask32 k, __m256i a)

Synopsis

__m256i _mm256_mask_abs_epi8 (__m256i src, __mmask32 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31
    i := j*8
    IF k[j]
        dst[i+7:i] := ABS(a[i+7:i])
    ELSE
        dst[i+7:i] := src[i+7:i]
    FI
ENDFOR
dst[MAX:256] := 0
vpabsb
__m256i _mm256_maskz_abs_epi8 (__mmask32 k, __m256i a)

Synopsis

__m256i _mm256_maskz_abs_epi8 (__mmask32 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31
    i := j*8
    IF k[j]
        dst[i+7:i] := ABS(a[i+7:i])
    ELSE
        dst[i+7:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vpabsb
__m512i _mm512_abs_epi8 (__m512i a)

Synopsis

__m512i _mm512_abs_epi8 (__m512i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 63
    i := j*8
    dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
dst[MAX:512] := 0
vpabsb
__m512i _mm512_mask_abs_epi8 (__m512i src, __mmask64 k, __m512i a)

Synopsis

__m512i _mm512_mask_abs_epi8 (__m512i src, __mmask64 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63
    i := j*8
    IF k[j]
        dst[i+7:i] := ABS(a[i+7:i])
    ELSE
        dst[i+7:i] := src[i+7:i]
    FI
ENDFOR
dst[MAX:512] := 0
vpabsb
__m512i _mm512_maskz_abs_epi8 (__mmask64 k, __m512i a)

Synopsis

__m512i _mm512_maskz_abs_epi8 (__mmask64 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512BW

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63
    i := j*8
    IF k[j]
        dst[i+7:i] := ABS(a[i+7:i])
    ELSE
        dst[i+7:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
vpandq
__m512d _mm512_abs_pd (__m512d v2)

Synopsis

__m512d _mm512_abs_pd (__m512d v2)
#include <immintrin.h>
Instruction: vpandq zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst.

Operation

FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := ABS(v2[i+63:i])
ENDFOR
dst[MAX:512] := 0
vpandq
__m512d _mm512_mask_abs_pd (__m512d src, __mmask8 k, __m512d v2)

Synopsis

__m512d _mm512_mask_abs_pd (__m512d src, __mmask8 k, __m512d v2)
#include <immintrin.h>
Instruction: vpandq zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := ABS(v2[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
pabsw
__m64 _mm_abs_pi16 (__m64 a)

Synopsis

__m64 _mm_abs_pi16 (__m64 a)
#include <tmmintrin.h>
Instruction: pabsw mm, mm
CPUID Flags: SSSE3

Description

Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 3
    i := j*16
    dst[i+15:i] := ABS(a[i+15:i])
ENDFOR

Performance

Architecture | Latency | Throughput (CPI)
Skylake      | 1       | 0.5
Broadwell    | 1       | 0.5
Haswell      | 1       | 0.5
Ivy Bridge   | 1       | 0.5
pabsd
__m64 _mm_abs_pi32 (__m64 a)

Synopsis

__m64 _mm_abs_pi32 (__m64 a)
#include <tmmintrin.h>
Instruction: pabsd mm, mm
CPUID Flags: SSSE3

Description

Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 1
    i := j*32
    dst[i+31:i] := ABS(a[i+31:i])
ENDFOR

Performance

Architecture | Latency | Throughput (CPI)
Skylake      | 1       | 0.5
Broadwell    | 1       | 0.5
Haswell      | 1       | 0.5
Ivy Bridge   | 1       | 0.5
pabsb
__m64 _mm_abs_pi8 (__m64 a)

Synopsis

__m64 _mm_abs_pi8 (__m64 a)
#include <tmmintrin.h>
Instruction: pabsb mm, mm
CPUID Flags: SSSE3

Description

Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.

Operation

FOR j := 0 to 7
    i := j*8
    dst[i+7:i] := ABS(a[i+7:i])
ENDFOR

Performance

Architecture | Latency | Throughput (CPI)
Skylake      | 1       | 0.5
Broadwell    | 1       | 0.5
Haswell      | 1       | 0.5
Ivy Bridge   | 1       | 0.5
vpandd
__m512 _mm512_abs_ps (__m512 v2)

Synopsis

__m512 _mm512_abs_ps (__m512 v2)
#include <immintrin.h>
Instruction: vpandd zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst.

Operation

FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := ABS(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpandd
__m512 _mm512_mask_abs_ps (__m512 src, __mmask16 k, __m512 v2)

Synopsis

__m512 _mm512_mask_abs_ps (__m512 src, __mmask16 k, __m512 v2)
#include <immintrin.h>
Instruction: vpandd zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := ABS(v2[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_acos_pd (__m128d a)

Synopsis

__m128d _mm_acos_pd (__m128d a)
#include <immintrin.h>
CPUID Flags: SSE

Description

Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1
    i := j*64
    dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_acos_pd (__m256d a)

Synopsis

__m256d _mm256_acos_pd (__m256d a)
#include <immintrin.h>
CPUID Flags: AVX

Description

Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*64
    dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_acos_pd (__m512d a)

Synopsis

__m512d _mm512_acos_pd (__m512d a)
#include <immintrin.h>
CPUID Flags: AVX512F

Description

Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_acos_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_acos_pd (__m512d src, __mmask8 k, __m512d a)
#include <immintrin.h>
CPUID Flags: AVX512F

Description

Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := ACOS(a[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_acos_ps (__m128 a)

Synopsis

__m128 _mm_acos_ps (__m128 a)
#include <immintrin.h>
CPUID Flags: SSE

Description

Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*32
    dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_acos_ps (__m256 a)

Synopsis

__m256 _mm256_acos_ps (__m256 a)
#include <immintrin.h>
CPUID Flags: AVX

Description

Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*32
    dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_acos_ps (__m512 a)

Synopsis

__m512 _mm512_acos_ps (__m512 a)
#include <immintrin.h>
CPUID Flags: AVX512F

Description

Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_acos_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_acos_ps (__m512 src, __mmask16 k, __m512 a)
#include <immintrin.h>
CPUID Flags: AVX512F

Description

Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := ACOS(a[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_acosh_pd (__m128d a)

Synopsis

__m128d _mm_acosh_pd (__m128d a)
#include <immintrin.h>
CPUID Flags: SSE

Description

Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 1
    i := j*64
    dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_acosh_pd (__m256d a)

Synopsis

__m256d _mm256_acosh_pd (__m256d a)
#include <immintrin.h>
CPUID Flags: AVX

Description

Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*64
    dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_acosh_pd (__m512d a)

Synopsis

__m512d _mm512_acosh_pd (__m512d a)
#include <immintrin.h>
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_acosh_pd (__m512d src, __mmask8 k, __m512d a)

Synopsis

__m512d _mm512_mask_acosh_pd (__m512d src, __mmask8 k, __m512d a)
#include <immintrin.h>
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := ACOSH(a[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_acosh_ps (__m128 a)

Synopsis

__m128 _mm_acosh_ps (__m128 a)
#include <immintrin.h>
CPUID Flags: SSE

Description

Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*32
    dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_acosh_ps (__m256 a)

Synopsis

__m256 _mm256_acosh_ps (__m256 a)
#include <immintrin.h>
CPUID Flags: AVX

Description

Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*32
    dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_acosh_ps (__m512 a)

Synopsis

__m512 _mm512_acosh_ps (__m512 a)
#include <immintrin.h>
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.

Operation

FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_acosh_ps (__m512 src, __mmask16 k, __m512 a)

Synopsis

__m512 _mm512_mask_acosh_ps (__m512 src, __mmask16 k, __m512 a)
#include <immintrin.h>
CPUID Flags: AVX512F

Description

Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := ACOSH(a[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
vpadcd
__m512i _mm512_adc_epi32 (__m512i v2, __mmask16 k2, __m512i v3, __mmask16 * k2_res)

Synopsis

__m512i _mm512_adc_epi32 (__m512i v2, __mmask16 k2, __m512i v3, __mmask16 * k2_res)
#include <immintrin.h>
Instruction: vpadcd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition of packed 32-bit integers in v2 and v3 and the corresponding bit in k2, storing the result of the addition in dst and the result of the carry in k2_res.

Operation

FOR j := 0 to 15
    i := j*32
    k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j])
    dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j]
ENDFOR
dst[MAX:512] := 0
vpadcd
__m512i _mm512_mask_adc_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * k2_res)

Synopsis

__m512i _mm512_mask_adc_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * k2_res)
#include <immintrin.h>
Instruction: vpadcd zmm {k}, k, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition of packed 32-bit integers in v2 and v3 and the corresponding bit in k2, storing the result of the addition in dst and the result of the carry in k2_res using writemask k1 (elements are copied from v2 when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k1[j]
        k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j])
        dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j]
    ELSE
        dst[i+31:i] := v2[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
paddw
__m128i _mm_add_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_add_epi16 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddw xmm, xmm
CPUID Flags: SSE2

Description

Add packed 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*16
    dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR

Performance

Architecture | Latency | Throughput (CPI)
Skylake      | 1       | 0.33
Broadwell    | 1       | 0.5
Haswell      | 1       | 0.5
Ivy Bridge   | 1       | 0.5
vpaddw
__m128i _mm_mask_add_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_add_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*16
    IF k[j]
        dst[i+15:i] := a[i+15:i] + b[i+15:i]
    ELSE
        dst[i+15:i] := src[i+15:i]
    FI
ENDFOR
dst[MAX:128] := 0
vpaddw
__m128i _mm_maskz_add_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_add_epi16 (__mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*16
    IF k[j]
        dst[i+15:i] := a[i+15:i] + b[i+15:i]
    ELSE
        dst[i+15:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vpaddw
__m256i _mm256_add_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_add_epi16 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15
    i := j*16
    dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput (CPI)
Skylake      | 1       | 0.33
Broadwell    | 1       | 0.5
Haswell      | 1       | 0.5
vpaddw
__m256i _mm256_mask_add_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_add_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*16
    IF k[j]
        dst[i+15:i] := a[i+15:i] + b[i+15:i]
    ELSE
        dst[i+15:i] := src[i+15:i]
    FI
ENDFOR
dst[MAX:256] := 0
vpaddw
__m256i _mm256_maskz_add_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_add_epi16 (__mmask16 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*16
    IF k[j]
        dst[i+15:i] := a[i+15:i] + b[i+15:i]
    ELSE
        dst[i+15:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vpaddw
__m512i _mm512_add_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_add_epi16 (__m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 31
    i := j*16
    dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
dst[MAX:512] := 0
vpaddw
__m512i _mm512_mask_add_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_add_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31
    i := j*16
    IF k[j]
        dst[i+15:i] := a[i+15:i] + b[i+15:i]
    ELSE
        dst[i+15:i] := src[i+15:i]
    FI
ENDFOR
dst[MAX:512] := 0
vpaddw
__m512i _mm512_maskz_add_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_add_epi16 (__mmask32 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512BW

Description

Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31
    i := j*16
    IF k[j]
        dst[i+15:i] := a[i+15:i] + b[i+15:i]
    ELSE
        dst[i+15:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
paddd
__m128i _mm_add_epi32 (__m128i a, __m128i b)

Synopsis

__m128i _mm_add_epi32 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddd xmm, xmm
CPUID Flags: SSE2

Description

Add packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*32
    dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR

Performance

Architecture | Latency | Throughput (CPI)
Skylake      | 1       | 0.33
Broadwell    | 1       | 0.5
Haswell      | 1       | 0.5
Ivy Bridge   | 1       | 0.5
vpaddd
__m128i _mm_mask_add_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_add_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
vpaddd
__m128i _mm_maskz_add_epi32 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_add_epi32 (__mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vpaddd
__m256i _mm256_add_epi32 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_add_epi32 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*32
    dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput (CPI)
Skylake      | 1       | 0.33
Broadwell    | 1       | 0.5
Haswell      | 1       | 0.5
vpaddd
__m256i _mm256_mask_add_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_add_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
vpaddd
__m256i _mm256_maskz_add_epi32 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_add_epi32 (__mmask8 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vpaddd
__m512i _mm512_add_epi32 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_add_epi32 (__m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vpaddd
__m512i _mm512_mask_add_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_add_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
vpaddd
__m512i _mm512_maskz_add_epi32 (__mmask16 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_add_epi32 (__mmask16 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
paddq
__m128i _mm_add_epi64 (__m128i a, __m128i b)

Synopsis

__m128i _mm_add_epi64 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddq xmm, xmm
CPUID Flags: SSE2

Description

Add packed 64-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 1
    i := j*64
    dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR

Performance

Architecture | Latency | Throughput (CPI)
Skylake      | 1       | 0.33
Broadwell    | 1       | 0.5
Haswell      | 1       | 0.5
Ivy Bridge   | 1       | 0.5
vpaddq
__m128i _mm_mask_add_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_add_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
vpaddq
__m128i _mm_maskz_add_epi64 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_add_epi64 (__mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vpaddq
__m256i _mm256_add_epi64 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_add_epi64 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddq ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 64-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*64
    dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput (CPI)
Skylake      | 1       | 0.33
Broadwell    | 1       | 0.5
Haswell      | 1       | 0.5
vpaddq
__m256i _mm256_mask_add_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_add_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
vpaddq
__m256i _mm256_maskz_add_epi64 (__mmask8 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_add_epi64 (__mmask8 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vpaddq
__m512i _mm512_add_epi64 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_add_epi64 (__m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vpaddq
__m512i _mm512_mask_add_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_add_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
vpaddq
__m512i _mm512_maskz_add_epi64 (__mmask8 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_add_epi64 (__mmask8 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
paddb
__m128i _mm_add_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_add_epi8 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddb xmm, xmm
CPUID Flags: SSE2

Description

Add packed 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 15
    i := j*8
    dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR

Performance

Architecture | Latency | Throughput (CPI)
Skylake      | 1       | 0.33
Broadwell    | 1       | 0.5
Haswell      | 1       | 0.5
Ivy Bridge   | 1       | 0.5
vpaddb
__m128i _mm_mask_add_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_add_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*8
    IF k[j]
        dst[i+7:i] := a[i+7:i] + b[i+7:i]
    ELSE
        dst[i+7:i] := src[i+7:i]
    FI
ENDFOR
dst[MAX:128] := 0
vpaddb
__m128i _mm_maskz_add_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_add_epi8 (__mmask16 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*8
    IF k[j]
        dst[i+7:i] := a[i+7:i] + b[i+7:i]
    ELSE
        dst[i+7:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vpaddb
__m256i _mm256_add_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_add_epi8 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 31
    i := j*8
    dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
dst[MAX:256] := 0

Performance

Architecture | Latency | Throughput (CPI)
Skylake      | 1       | 0.33
Broadwell    | 1       | 0.5
Haswell      | 1       | 0.5
vpaddb
__m256i _mm256_mask_add_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_add_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31
    i := j*8
    IF k[j]
        dst[i+7:i] := a[i+7:i] + b[i+7:i]
    ELSE
        dst[i+7:i] := src[i+7:i]
    FI
ENDFOR
dst[MAX:256] := 0
vpaddb
__m256i _mm256_maskz_add_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_add_epi8 (__mmask32 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31
    i := j*8
    IF k[j]
        dst[i+7:i] := a[i+7:i] + b[i+7:i]
    ELSE
        dst[i+7:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vpaddb
__m512i _mm512_add_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_add_epi8 (__m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 63
    i := j*8
    dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
dst[MAX:512] := 0
vpaddb
__m512i _mm512_mask_add_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_add_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63
    i := j*8
    IF k[j]
        dst[i+7:i] := a[i+7:i] + b[i+7:i]
    ELSE
        dst[i+7:i] := src[i+7:i]
    FI
ENDFOR
dst[MAX:512] := 0
vpaddb
__m512i _mm512_maskz_add_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_add_epi8 (__mmask64 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512BW

Description

Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63
    i := j*8
    IF k[j]
        dst[i+7:i] := a[i+7:i] + b[i+7:i]
    ELSE
        dst[i+7:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
addpd
__m128d _mm_add_pd (__m128d a, __m128d b)

Synopsis

__m128d _mm_add_pd (__m128d a, __m128d b)
#include <emmintrin.h>
Instruction: addpd xmm, xmm
CPUID Flags: SSE2

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 1
    i := j*64
    dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR

Performance

Architecture | Latency | Throughput (CPI)
Skylake      | 4       | 0.5
Broadwell    | 3       | 1
Haswell      | 3       | 1
Ivy Bridge   | 3       | 1
vaddpd
__m128d _mm_mask_add_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_add_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include <immintrin.h>
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
vaddpd
__m128d _mm_maskz_add_pd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_add_pd (__mmask8 k, __m128d a, __m128d b)
#include <immintrin.h>
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vaddpd
__m256d _mm256_add_pd (__m256d a, __m256d b)

Synopsis

__m256d _mm256_add_pd (__m256d a, __m256d b)
#include <immintrin.h>
Instruction: vaddpd ymm, ymm, ymm
CPUID Flags: AVX

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*64
    dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:256] := 0

Performance

Architecture   Latency   Throughput (CPI)
Skylake        4         0.5
Broadwell      3         1
Haswell        3         1
Ivy Bridge     3         1
vaddpd
__m256d _mm256_mask_add_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_mask_add_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include <immintrin.h>
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
vaddpd
__m256d _mm256_maskz_add_pd (__mmask8 k, __m256d a, __m256d b)

Synopsis

__m256d _mm256_maskz_add_pd (__mmask8 k, __m256d a, __m256d b)
#include <immintrin.h>
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vaddpd
__m512d _mm512_add_pd (__m512d a, __m512d b)

Synopsis

__m512d _mm512_add_pd (__m512d a, __m512d b)
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_mask_add_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_mask_add_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_maskz_add_pd (__mmask8 k, __m512d a, __m512d b)

Synopsis

__m512d _mm512_maskz_add_pd (__mmask8 k, __m512d a, __m512d b)
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
paddw
__m64 _mm_add_pi16 (__m64 a, __m64 b)

Synopsis

__m64 _mm_add_pi16 (__m64 a, __m64 b)
#include <mmintrin.h>
Instruction: paddw mm, mm
CPUID Flags: MMX

Description

Add packed 16-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*16
    dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
paddd
__m64 _mm_add_pi32 (__m64 a, __m64 b)

Synopsis

__m64 _mm_add_pi32 (__m64 a, __m64 b)
#include <mmintrin.h>
Instruction: paddd mm, mm
CPUID Flags: MMX

Description

Add packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 1
    i := j*32
    dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
paddb
__m64 _mm_add_pi8 (__m64 a, __m64 b)

Synopsis

__m64 _mm_add_pi8 (__m64 a, __m64 b)
#include <mmintrin.h>
Instruction: paddb mm, mm
CPUID Flags: MMX

Description

Add packed 8-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*8
    dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
addps
__m128 _mm_add_ps (__m128 a, __m128 b)

Synopsis

__m128 _mm_add_ps (__m128 a, __m128 b)
#include <xmmintrin.h>
Instruction: addps xmm, xmm
CPUID Flags: SSE

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 3
    i := j*32
    dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR

Performance

Architecture   Latency   Throughput (CPI)
Skylake        4         0.5
Broadwell      3         1
Haswell        3         1
Ivy Bridge     3         1
vaddps
__m128 _mm_mask_add_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_add_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include <immintrin.h>
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
vaddps
__m128 _mm_maskz_add_ps (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_add_ps (__mmask8 k, __m128 a, __m128 b)
#include <immintrin.h>
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vaddps
__m256 _mm256_add_ps (__m256 a, __m256 b)

Synopsis

__m256 _mm256_add_ps (__m256 a, __m256 b)
#include <immintrin.h>
Instruction: vaddps ymm, ymm, ymm
CPUID Flags: AVX

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*32
    dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:256] := 0

Performance

Architecture   Latency   Throughput (CPI)
Skylake        4         0.5
Broadwell      3         1
Haswell        3         1
Ivy Bridge     3         1
vaddps
__m256 _mm256_mask_add_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_mask_add_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include <immintrin.h>
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
vaddps
__m256 _mm256_maskz_add_ps (__mmask8 k, __m256 a, __m256 b)

Synopsis

__m256 _mm256_maskz_add_ps (__mmask8 k, __m256 a, __m256 b)
#include <immintrin.h>
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vaddps
__m512 _mm512_add_ps (__m512 a, __m512 b)

Synopsis

__m512 _mm512_add_ps (__m512 a, __m512 b)
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.

Operation

FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_mask_add_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_mask_add_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_maskz_add_ps (__mmask16 k, __m512 a, __m512 b)

Synopsis

__m512 _mm512_maskz_add_ps (__mmask16 k, __m512 a, __m512 b)
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_add_round_pd (__m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_add_round_pd (__m512d a, __m512d b, int rounding)
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_mask_add_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_mask_add_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_maskz_add_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)

Synopsis

__m512d _mm512_maskz_add_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := a[i+63:i] + b[i+63:i]
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_add_round_ps (__m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_add_round_ps (__m512 a, __m512 b, int rounding)
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_mask_add_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_mask_add_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_maskz_add_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)

Synopsis

__m512 _mm512_maskz_add_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F

Description

Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := a[i+31:i] + b[i+31:i]
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
vaddsd
__m128d _mm_add_round_sd (__m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_add_round_sd (__m128d a, __m128d b, int rounding)
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[63:0] := a[63:0] + b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vaddsd
__m128d _mm_mask_add_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_mask_add_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0]
    dst[63:0] := a[63:0] + b[63:0]
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vaddsd
__m128d _mm_maskz_add_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)

Synopsis

__m128d _mm_maskz_add_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0]
    dst[63:0] := a[63:0] + b[63:0]
ELSE
    dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vaddss
__m128 _mm_add_round_ss (__m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_add_round_ss (__m128 a, __m128 b, int rounding)
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

dst[31:0] := a[31:0] + b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vaddss
__m128 _mm_mask_add_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_mask_add_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0]
    dst[31:0] := a[31:0] + b[31:0]
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vaddss
__m128 _mm_maskz_add_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)

Synopsis

__m128 _mm_maskz_add_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F

Description

Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

IF k[0]
    dst[31:0] := a[31:0] + b[31:0]
ELSE
    dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
addsd
__m128d _mm_add_sd (__m128d a, __m128d b)

Synopsis

__m128d _mm_add_sd (__m128d a, __m128d b)
#include <emmintrin.h>
Instruction: addsd xmm, xmm
CPUID Flags: SSE2

Description

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.

Operation

dst[63:0] := a[63:0] + b[63:0]
dst[127:64] := a[127:64]

Performance

Architecture   Latency   Throughput (CPI)
Skylake        4         0.5
Broadwell      3         1
Haswell        3         1
Ivy Bridge     3         1
vaddsd
__m128d _mm_mask_add_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_mask_add_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0]
    dst[63:0] := a[63:0] + b[63:0]
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vaddsd
__m128d _mm_maskz_add_sd (__mmask8 k, __m128d a, __m128d b)

Synopsis

__m128d _mm_maskz_add_sd (__mmask8 k, __m128d a, __m128d b)
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.

Operation

IF k[0]
    dst[63:0] := a[63:0] + b[63:0]
ELSE
    dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
paddq
__m64 _mm_add_si64 (__m64 a, __m64 b)

Synopsis

__m64 _mm_add_si64 (__m64 a, __m64 b)
#include <emmintrin.h>
Instruction: paddq mm, mm
CPUID Flags: SSE2

Description

Add 64-bit integers a and b, and store the result in dst.

Operation

dst[63:0] := a[63:0] + b[63:0]

Performance

Architecture   Latency   Throughput (CPI)
Skylake        1         0.33
Broadwell      1         0.5
Haswell        1         0.5
Ivy Bridge     1         0.5
addss
__m128 _mm_add_ss (__m128 a, __m128 b)

Synopsis

__m128 _mm_add_ss (__m128 a, __m128 b)
#include <xmmintrin.h>
Instruction: addss xmm, xmm
CPUID Flags: SSE

Description

Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

dst[31:0] := a[31:0] + b[31:0]
dst[127:32] := a[127:32]

Performance

Architecture   Latency   Throughput (CPI)
Skylake        4         0.5
Broadwell      3         1
Haswell        3         1
Ivy Bridge     3         1
vaddss
__m128 _mm_mask_add_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_mask_add_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0]
    dst[31:0] := a[31:0] + b[31:0]
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vaddss
__m128 _mm_maskz_add_ss (__mmask8 k, __m128 a, __m128 b)

Synopsis

__m128 _mm_maskz_add_ss (__mmask8 k, __m128 a, __m128 b)
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm
CPUID Flags: AVX512F

Description

Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.

Operation

IF k[0]
    dst[31:0] := a[31:0] + b[31:0]
ELSE
    dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
adc
unsigned char _addcarry_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)

Synopsis

unsigned char _addcarry_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)
#include <immintrin.h>
Instruction: adc r32, r32

Description

Add unsigned 32-bit integers a and b with unsigned 8-bit carry-in c_in (carry flag), and store the unsigned 32-bit result in out, and the carry-out in dst (carry flag).

Operation

out[31:0] := a[31:0] + b[31:0] + c_in
dst := carry_out
adc
unsigned char _addcarry_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)

Synopsis

unsigned char _addcarry_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
#include <immintrin.h>
Instruction: adc r64, r64

Description

Add unsigned 64-bit integers a and b with unsigned 8-bit carry-in c_in (carry flag), and store the unsigned 64-bit result in out, and the carry-out in dst (carry flag).

Operation

out[63:0] := a[63:0] + b[63:0] + c_in
dst := carry_out
adcx, adox
unsigned char _addcarryx_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)

Synopsis

unsigned char _addcarryx_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)
#include <immintrin.h>
Instruction: adcx r32, r32
             adox r32, r32
CPUID Flags: ADX

Description

Add unsigned 32-bit integers a and b with unsigned 8-bit carry-in c_in (carry or overflow flag), and store the unsigned 32-bit result in out, and the carry-out in dst (carry or overflow flag).

Operation

out[31:0] := a[31:0] + b[31:0] + c_in
dst := carry_out

Performance

Architecture   Latency   Throughput (CPI)
Skylake        1         1
Broadwell      1         1
adcx, adox
unsigned char _addcarryx_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)

Synopsis

unsigned char _addcarryx_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
#include <immintrin.h>
Instruction: adcx r64, r64
             adox r64, r64
CPUID Flags: ADX

Description

Add unsigned 64-bit integers a and b with unsigned 8-bit carry-in c_in (carry or overflow flag), and store the unsigned 64-bit result in out, and the carry-out in dst (carry or overflow flag).

Operation

out[63:0] := a[63:0] + b[63:0] + c_in
dst := carry_out

Performance

Architecture   Latency   Throughput (CPI)
Skylake        1         1
Broadwell      1         1
vaddnpd
__m512d _mm512_addn_pd (__m512d v2, __m512d v3)

Synopsis

__m512d _mm512_addn_pd (__m512d v2, __m512d v3)
#include <immintrin.h>
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst.

Operation

FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
ENDFOR
dst[MAX:512] := 0
vaddnpd
__m512d _mm512_mask_addn_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3)

Synopsis

__m512d _mm512_mask_addn_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3)
#include <immintrin.h>
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
vaddnps
__m512 _mm512_addn_ps (__m512 v2, __m512 v3)

Synopsis

__m512 _mm512_addn_ps (__m512 v2, __m512 v3)
#include <immintrin.h>
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst.

Operation

FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
ENDFOR
dst[MAX:512] := 0
vaddnps
__m512 _mm512_mask_addn_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3)

Synopsis

__m512 _mm512_mask_addn_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3)
#include <immintrin.h>
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
vaddnpd
__m512d _mm512_addn_round_pd (__m512d v2, __m512d v3, int rounding)

Synopsis

__m512d _mm512_addn_round_pd (__m512d v2, __m512d v3, int rounding)
#include <immintrin.h>
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates the sum, storing the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
ENDFOR
dst[MAX:512] := 0
vaddnpd
__m512d _mm512_mask_addn_round_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3, int rounding)

Synopsis

__m512d _mm512_mask_addn_round_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3, int rounding)
#include <immintrin.h>
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates the sum, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
vaddnps
__m512 _mm512_addn_round_ps (__m512 v2, __m512 v3, int rounding)

Synopsis

__m512 _mm512_addn_round_ps (__m512 v2, __m512 v3, int rounding)
#include <immintrin.h>
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates the sum, storing the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
ENDFOR
dst[MAX:512] := 0
vaddnps
__m512 _mm512_mask_addn_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, int rounding)

Synopsis

__m512 _mm512_mask_addn_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, int rounding)
#include <immintrin.h>
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI

Description

Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates the sum, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

Operation

FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
paddsw
__m128i _mm_adds_epi16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_adds_epi16 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddsw xmm, xmm
CPUID Flags: SSE2

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*16
    dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR

Performance

Architecture    Latency    Throughput (CPI)
Skylake         1          0.5
Broadwell       1          0.5
Haswell         1          0.5
Ivy Bridge      1          0.5
vpaddsw
__m128i _mm_mask_adds_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_adds_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*16
    IF k[j]
        dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
    ELSE
        dst[i+15:i] := src[i+15:i]
    FI
ENDFOR
dst[MAX:128] := 0
vpaddsw
__m128i _mm_maskz_adds_epi16 (__mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_adds_epi16 (__mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*16
    IF k[j]
        dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
    ELSE
        dst[i+15:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vpaddsw
__m256i _mm256_adds_epi16 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_adds_epi16 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddsw ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 15
    i := j*16
    dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:256] := 0
vpaddsw
__m256i _mm256_mask_adds_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_adds_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*16
    IF k[j]
        dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
    ELSE
        dst[i+15:i] := src[i+15:i]
    FI
ENDFOR
dst[MAX:256] := 0
vpaddsw
__m256i _mm256_maskz_adds_epi16 (__mmask16 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_adds_epi16 (__mmask16 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*16
    IF k[j]
        dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
    ELSE
        dst[i+15:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vpaddsw
__m512i _mm512_adds_epi16 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_adds_epi16 (__m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 31
    i := j*16
    dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:512] := 0
vpaddsw
__m512i _mm512_mask_adds_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_adds_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31
    i := j*16
    IF k[j]
        dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
    ELSE
        dst[i+15:i] := src[i+15:i]
    FI
ENDFOR
dst[MAX:512] := 0
vpaddsw
__m512i _mm512_maskz_adds_epi16 (__mmask32 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_adds_epi16 (__mmask32 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512BW

Description

Add packed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31
    i := j*16
    IF k[j]
        dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
    ELSE
        dst[i+15:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
paddsb
__m128i _mm_adds_epi8 (__m128i a, __m128i b)

Synopsis

__m128i _mm_adds_epi8 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddsb xmm, xmm
CPUID Flags: SSE2

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 15
    i := j*8
    dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR

Performance

Architecture    Latency    Throughput (CPI)
Skylake         1          0.5
Broadwell       1          0.5
Haswell         1          0.5
Ivy Bridge      1          0.5
vpaddsb
__m128i _mm_mask_adds_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_adds_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*8
    IF k[j]
        dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
    ELSE
        dst[i+7:i] := src[i+7:i]
    FI
ENDFOR
dst[MAX:128] := 0
vpaddsb
__m128i _mm_maskz_adds_epi8 (__mmask16 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_maskz_adds_epi8 (__mmask16 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 15
    i := j*8
    IF k[j]
        dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
    ELSE
        dst[i+7:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
vpaddsb
__m256i _mm256_adds_epi8 (__m256i a, __m256i b)

Synopsis

__m256i _mm256_adds_epi8 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddsb ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 31
    i := j*8
    dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:256] := 0

Performance

Architecture    Latency    Throughput (CPI)
Skylake         1          0.5
Broadwell       1          0.5
Haswell         1          0.5
vpaddsb
__m256i _mm256_mask_adds_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_mask_adds_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31
    i := j*8
    IF k[j]
        dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
    ELSE
        dst[i+7:i] := src[i+7:i]
    FI
ENDFOR
dst[MAX:256] := 0
vpaddsb
__m256i _mm256_maskz_adds_epi8 (__mmask32 k, __m256i a, __m256i b)

Synopsis

__m256i _mm256_maskz_adds_epi8 (__mmask32 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 31
    i := j*8
    IF k[j]
        dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
    ELSE
        dst[i+7:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
vpaddsb
__m512i _mm512_adds_epi8 (__m512i a, __m512i b)

Synopsis

__m512i _mm512_adds_epi8 (__m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 63
    i := j*8
    dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:512] := 0
vpaddsb
__m512i _mm512_mask_adds_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_mask_adds_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63
    i := j*8
    IF k[j]
        dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
    ELSE
        dst[i+7:i] := src[i+7:i]
    FI
ENDFOR
dst[MAX:512] := 0
vpaddsb
__m512i _mm512_maskz_adds_epi8 (__mmask64 k, __m512i a, __m512i b)

Synopsis

__m512i _mm512_maskz_adds_epi8 (__mmask64 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512BW

Description

Add packed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).

Operation

FOR j := 0 to 63
    i := j*8
    IF k[j]
        dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
    ELSE
        dst[i+7:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
paddusw
__m128i _mm_adds_epu16 (__m128i a, __m128i b)

Synopsis

__m128i _mm_adds_epu16 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddusw xmm, xmm
CPUID Flags: SSE2

Description

Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst.

Operation

FOR j := 0 to 7
    i := j*16
    dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ENDFOR

Performance

Architecture    Latency    Throughput (CPI)
Skylake         1          0.5
Broadwell       1          0.5
Haswell         1          0.5
Ivy Bridge      1          0.5
vpaddusw
__m128i _mm_mask_adds_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)

Synopsis

__m128i _mm_mask_adds_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddusw
CPUID Flags: AVX512VL + AVX512BW

Description

Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).

Operation

FOR j := 0 to 7
    i := j*16
    IF k[j]
        dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
    ELSE
        dst[i+15:i] := src[i+15:i]
    FI
ENDFOR
dst[MAX:128] := 0
Data Version: 3.4.4 - Release Notes
Data Updated: 04/17/2019
