The Intel Intrinsics Guide is an interactive reference tool for
Intel intrinsic instructions, which are C style functions that provide
access to many Intel instructions - including Intel® SSE, AVX, AVX-512,
and more - without the need to write assembly code.
vp4dpwssd
__m512i _mm512_4dpwssd_epi32 (__m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
Synopsis
__m512i _mm512_4dpwssd_epi32 (__m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
#include <immintrin.h>
Instruction: vp4dpwssd zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW
#include <immintrin.h>
Instruction: vp4dpwssd zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW
Description
Compute
4 sequential operand source-block dot-products of two signed 16-bit
element operands with 32-bit element accumulation, and store the results
in dst.
Operation
FOR j := 0 to 15
FOR m := 0 to 3
lim_base := m*32
i := j*32
tl := b[lim_base+15:lim_base]
tu := b[lim_base+31:lim_base+16]
lword := a{m}[i+15:i] * tl
uword := a{m}[i+31:i+16] * tu
dst[i+31:i] := src[i+31:i] + lword + uword
ENDFOR
ENDFOR
dst[MAX:512] := 0
vp4dpwssd
__m512i _mm512_mask_4dpwssd_epi32 (__m512i src, __mmask16 k, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
Synopsis
__m512i _mm512_mask_4dpwssd_epi32 (__m512i src, __mmask16 k, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
#include <immintrin.h>
Instruction: vp4dpwssd zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW
#include <immintrin.h>
Instruction: vp4dpwssd zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW
Description
Compute
4 sequential operand source-block dot-products of two signed 16-bit
element operands with 32-bit element accumulation with mask, and store
the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
IF mask[j]
FOR m := 0 to 3
lim_base := m*32
i := j*32
tl := b[lim_base+15:lim_base]
tu := b[lim_base+31:lim_base+16]
lword := a{m}[i+15:i] * tl
uword := a{m}[i+31:i+16] * tu
dst[i+31:i] := src[i+31:i] + lword + uword
ENDFOR
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vp4dpwssd
__m512i _mm512_maskz_4dpwssd_epi32 (__mmask16 k, __m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
vp4dpwssds
__m512i _mm512_4dpwssds_epi32 (__m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
Synopsis
__m512i _mm512_4dpwssds_epi32 (__m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
#include <immintrin.h>
Instruction: vp4dpwssds zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW
#include <immintrin.h>
Instruction: vp4dpwssds zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW
Description
Compute
4 sequential operand source-block dot-products of two signed 16-bit
element operands with 32-bit element accumulation and signed saturation,
and store the results in dst.
Operation
FOR j := 0 to 15
FOR m := 0 to 3
lim_base := m*32
i := j*32
tl := b[lim_base+15:lim_base]
tu := b[lim_base+31:lim_base+16]
lword := a{m}[i+15:i] * tl
uword := a{m}[i+31:i+16] * tu
dst[i+31:i] := SIGNED_DWORD_SATURATE(src[i+31:i] + lword + uword)
ENDFOR
ENDFOR
dst[MAX:512] := 0
vp4dpwssds
__m512i _mm512_mask_4dpwssds_epi32 (__m512i src, __mmask16 k, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
Synopsis
__m512i _mm512_mask_4dpwssds_epi32 (__m512i src, __mmask16 k, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
#include <immintrin.h>
Instruction: vp4dpwssds zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW
#include <immintrin.h>
Instruction: vp4dpwssds zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW
Description
Compute
4 sequential operand source-block dot-products of two signed 16-bit
element operands with 32-bit element accumulation with mask and signed
saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
IF mask[j]
FOR m := 0 to 3
lim_base := m*32
i := j*32
tl := b[lim_base+15:lim_base]
tu := b[lim_base+31:lim_base+16]
lword := a{m}[i+15:i] * tl
uword := a{m}[i+31:i+16] * tu
dst[i+31:i] := SIGNED_DWORD_SATURATE(src[i+31:i] + lword + uword)
ENDFOR
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vp4dpwssds
__m512i _mm512_maskz_4dpwssds_epi32 (__mmask16 k, __m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
Synopsis
__m512i _mm512_maskz_4dpwssds_epi32 (__mmask16 k, __m512i src, __m512i a0, __m512i a1, __m512i a2, __m512i a3, __m128i * b)
#include <immintrin.h>
Instruction: vp4dpwssds zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW
#include <immintrin.h>
Instruction: vp4dpwssds zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4VNNIW
Description
Compute
4 sequential operand source-block dot-products of two signed 16-bit
element operands with 32-bit element accumulation with mask and signed
saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
IF mask[j]
FOR m := 0 to 3
lim_base := m*32
i := j*32
tl := b[lim_base+15:lim_base]
tu := b[lim_base+31:lim_base+16]
lword := a{m}[i+15:i] * tl
uword := a{m}[i+31:i+16] * tu
dst[i+31:i] := SIGNED_DWORD_SATURATE(src[i+31:i] + lword + uword)
ENDFOR
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
v4fmaddps
__m512 _mm512_4fmadd_ps (__m512 a, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
Synopsis
__m512 _mm512_4fmadd_ps (__m512 a, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS
#include <immintrin.h>
Instruction: v4fmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS
Description
Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the 4 corresponding packed elements in c, accumulating with the corresponding elements in a. Store the results in dst.
Operation
dst := a
FOR m := 0 to 3
FOR j := 0 to 15
i := j*32
n := m*32
dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] + b{m}[i+31:i] * c[n+31:n])
ENDFOR
ENDFOR
dst[MAX:512] := 0
v4fmaddps
__m512 _mm512_mask_4fmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
Synopsis
__m512 _mm512_mask_4fmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS
#include <immintrin.h>
Instruction: v4fmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS
Description
Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the 4 corresponding packed elements in c, accumulating with the corresponding elements in a. Store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
dst := a
FOR m := 0 to 3
FOR j := 0 to 15
i := j*32
n := m*32
IF mask[j]
dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] + b{m}[i+31:i] * c[n+31:n])
FI
ENDFOR
ENDFOR
dst[MAX:512] := 0
v4fmaddps
__m512 _mm512_maskz_4fmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
Synopsis
__m512 _mm512_maskz_4fmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS
#include <immintrin.h>
Instruction: v4fmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS
Description
Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the 4 corresponding packed elements in c, accumulating with the corresponding elements in a. Store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
dst := a
FOR m := 0 to 3
FOR j := 0 to 15
i := j*32
n := m*32
IF mask[j]
dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] + b{m}[i+31:i] * c[n+31:n])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
ENDFOR
dst[MAX:512] := 0
v4fmaddss
__m128 _mm_4fmadd_ss (__m128 a, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
Synopsis
__m128 _mm_4fmadd_ss (__m128 a, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS
#include <immintrin.h>
Instruction: v4fmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS
Description
Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by corresponding element in c, accumulating with the lower element in a. Store the result in the lower element of dst.
Operation
dst := a
FOR j := 0 to 3
i := j*32
dst[31:0] := RoundFPControl_MXCSR(dst[31:0] + b{j}[31:0] * c[i+31:i])
ENDFOR
dst[MAX:32] := 0
v4fmaddss
__m128 _mm_mask_4fmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
Synopsis
__m128 _mm_mask_4fmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS
#include <immintrin.h>
Instruction: v4fmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS
Description
Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by corresponding element in c, accumulating with the lower element in a. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set).
Operation
dst := a
IF k[0]
FOR j := 0 to 3
i := j*32
dst[31:0] := RoundFPControl_MXCSR(dst[31:0] + b{j}[31:0] * c[i+31:i])
ENDFOR
FI
dst[MAX:32] := 0
v4fmaddss
__m128 _mm_maskz_4fmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
Synopsis
__m128 _mm_maskz_4fmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS
#include <immintrin.h>
Instruction: v4fmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS
Description
Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by corresponding element in c, accumulating with the lower element in a. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
Operation
dst := a
IF k[0]
FOR j := 0 to 3
i := j*32
dst[31:0] := RoundFPControl_MXCSR(dst[31:0] + b{j}[31:0] * c[i+31:i])
ENDFOR
ELSE
dst[31:0] := 0
FI
dst[MAX:32] := 0
v4fnmaddps
__m512 _mm512_4fnmadd_ps (__m512 a, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
Synopsis
__m512 _mm512_4fnmadd_ps (__m512 a, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fnmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS
#include <immintrin.h>
Instruction: v4fnmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS
Description
Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the 4 corresponding packed elements in c, accumulating the negated intermediate result with the corresponding elements in a. Store the results in dst.
Operation
dst := a
FOR m := 0 to 3
FOR j := 0 to 15
i := j*32
n := m*32
dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] - b{m}[i+31:i] * c[n+31:n])
ENDFOR
ENDFOR
dst[MAX:512] := 0
v4fnmaddps
__m512 _mm512_mask_4fnmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
Synopsis
__m512 _mm512_mask_4fnmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fnmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS
#include <immintrin.h>
Instruction: v4fnmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS
Description
Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the 4 corresponding packed elements in c, accumulating the negated intermediate result with the corresponding elements in a. Store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
dst := a
FOR m := 0 to 3
FOR j := 0 to 15
i := j*32
n := m*32
IF mask[j]
dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] - b{m}[i+31:i] * c[n+31:n])
FI
ENDFOR
ENDFOR
dst[MAX:512] := 0
v4fnmaddps
__m512 _mm512_maskz_4fnmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
Synopsis
__m512 _mm512_maskz_4fnmadd_ps (__m512 a, __mmask16 k, __m512 b0, __m512 b1, __m512 b2, __m512 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fnmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS
#include <immintrin.h>
Instruction: v4fnmaddps zmm {k}, zmm+3, m128
CPUID Flags: AVX512_4FMAPS
Description
Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by the 4 corresponding packed elements in c, accumulating the negated intermediate result with the corresponding elements in a. Store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
dst := a
FOR m := 0 to 3
FOR j := 0 to 15
i := j*32
n := m*32
IF mask[j]
dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] - b{m}[i+31:i] * c[n+31:n])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
ENDFOR
dst[MAX:512] := 0
v4fnmaddss
__m128 _mm_4fnmadd_ss (__m128 a, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
Synopsis
__m128 _mm_4fnmadd_ss (__m128 a, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fnmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS
#include <immintrin.h>
Instruction: v4fnmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS
Description
Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by corresponding element in c, accumulating the negated intermediate result with the lower element in a. Store the result in the lower element of dst.
Operation
dst := a
FOR j := 0 to 3
i := j*32
dst[31:0] := RoundFPControl_MXCSR(dst[31:0] - b{j}[31:0] * c[i+31:i])
ENDFOR
dst[MAX:32] := 0
v4fnmaddss
__m128 _mm_mask_4fnmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
Synopsis
__m128 _mm_mask_4fnmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fnmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS
#include <immintrin.h>
Instruction: v4fnmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS
Description
Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by corresponding element in c, accumulating the negated intermediate result with the lower element in a. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set).
Operation
dst := a
IF k[0]
FOR j := 0 to 3
i := j*32
dst[31:0] := RoundFPControl_MXCSR(dst[31:0] - b{j}[31:0] * c[i+31:i])
ENDFOR
FI
dst[MAX:32] := 0
v4fnmaddss
__m128 _mm_maskz_4fnmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
Synopsis
__m128 _mm_maskz_4fnmadd_ss (__m128 a, __mmask8 k, __m128 b0, __m128 b1, __m128 b2, __m128 b3, __m128 * c)
#include <immintrin.h>
Instruction: v4fnmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS
#include <immintrin.h>
Instruction: v4fnmaddss xmm {k}, xmm, m128
CPUID Flags: AVX512_4FMAPS
Description
Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands b0 through b3 by corresponding element in c, accumulating the negated intermediate result with the lower element in a. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
Operation
dst := a
IF k[0]
FOR j := 0 to 3
i := j*32
dst[31:0] := RoundFPControl_MXCSR(dst[31:0] - b{j}[31:0] * c[i+31:i])
ENDFOR
ELSE
dst[31:0] := 0
FI
dst[MAX:32] := 0
pabsw
__m128i _mm_abs_epi16 (__m128i a)
Synopsis
__m128i _mm_abs_epi16 (__m128i a)
#include <tmmintrin.h>
Instruction: pabsw xmm, xmm
CPUID Flags: SSSE3
#include <tmmintrin.h>
Instruction: pabsw xmm, xmm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.5 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
vpabsw
__m128i _mm_mask_abs_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_abs_epi16 (__m128i src, __mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpabsw
__m128i _mm_maskz_abs_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_abs_epi16 (__mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpabsw
__m256i _mm256_abs_epi16 (__m256i a)
Synopsis
__m256i _mm256_abs_epi16 (__m256i a)
#include <immintrin.h>
Instruction: vpabsw ymm, ymm
CPUID Flags: AVX2
#include <immintrin.h>
Instruction: vpabsw ymm, ymm
CPUID Flags: AVX2
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
dst[MAX:256] := 0
vpabsw
__m256i _mm256_mask_abs_epi16 (__m256i src, __mmask16 k, __m256i a)
Synopsis
__m256i _mm256_mask_abs_epi16 (__m256i src, __mmask16 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpabsw
__m256i _mm256_maskz_abs_epi16 (__mmask16 k, __m256i a)
Synopsis
__m256i _mm256_maskz_abs_epi16 (__mmask16 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpabsw
__m512i _mm512_abs_epi16 (__m512i a)
Synopsis
__m512i _mm512_abs_epi16 (__m512i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
dst[MAX:512] := 0
vpabsw
__m512i _mm512_mask_abs_epi16 (__m512i src, __mmask32 k, __m512i a)
Synopsis
__m512i _mm512_mask_abs_epi16 (__m512i src, __mmask32 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpabsw
__m512i _mm512_maskz_abs_epi16 (__mmask32 k, __m512i a)
Synopsis
__m512i _mm512_maskz_abs_epi16 (__mmask32 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpabsw
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pabsd
__m128i _mm_abs_epi32 (__m128i a)
Synopsis
__m128i _mm_abs_epi32 (__m128i a)
#include <tmmintrin.h>
Instruction: pabsd xmm, xmm
CPUID Flags: SSSE3
#include <tmmintrin.h>
Instruction: pabsd xmm, xmm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.5 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
vpabsd
__m128i _mm_mask_abs_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_abs_epi32 (__m128i src, __mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpabsd
__m128i _mm_maskz_abs_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_abs_epi32 (__mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpabsd
__m256i _mm256_abs_epi32 (__m256i a)
Synopsis
__m256i _mm256_abs_epi32 (__m256i a)
#include <immintrin.h>
Instruction: vpabsd ymm, ymm
CPUID Flags: AVX2
#include <immintrin.h>
Instruction: vpabsd ymm, ymm
CPUID Flags: AVX2
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
vpabsd
__m256i _mm256_mask_abs_epi32 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_abs_epi32 (__m256i src, __mmask8 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpabsd
__m256i _mm256_maskz_abs_epi32 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_abs_epi32 (__mmask8 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpabsd
__m512i _mm512_abs_epi32 (__m512i a)
Synopsis
__m512i _mm512_abs_epi32 (__m512i a)
#include <immintrin.h>
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpabsd
__m512i _mm512_mask_abs_epi32 (__m512i src, __mmask16 k, __m512i a)
Synopsis
__m512i _mm512_mask_abs_epi32 (__m512i src, __mmask16 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpabsd
__m512i _mm512_maskz_abs_epi32 (__mmask16 k, __m512i a)
Synopsis
__m512i _mm512_maskz_abs_epi32 (__mmask16 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpabsq
__m128i _mm_abs_epi64 (__m128i a)
Synopsis
__m128i _mm_abs_epi64 (__m128i a)
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vpabsq
__m128i _mm_mask_abs_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_abs_epi64 (__m128i src, __mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpabsq
__m128i _mm_maskz_abs_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_abs_epi64 (__mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpabsq
__m256i _mm256_abs_epi64 (__m256i a)
Synopsis
__m256i _mm256_abs_epi64 (__m256i a)
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vpabsq
__m256i _mm256_mask_abs_epi64 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_abs_epi64 (__m256i src, __mmask8 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpabsq
__m256i _mm256_maskz_abs_epi64 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_abs_epi64 (__mmask8 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpabsq
__m512i _mm512_abs_epi64 (__m512i a)
Synopsis
__m512i _mm512_abs_epi64 (__m512i a)
#include <immintrin.h>
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vpabsq
__m512i _mm512_mask_abs_epi64 (__m512i src, __mmask8 k, __m512i a)
Synopsis
__m512i _mm512_mask_abs_epi64 (__m512i src, __mmask8 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpabsq
__m512i _mm512_maskz_abs_epi64 (__mmask8 k, __m512i a)
Synopsis
__m512i _mm512_maskz_abs_epi64 (__mmask8 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pabsb
__m128i _mm_abs_epi8 (__m128i a)
Synopsis
__m128i _mm_abs_epi8 (__m128i a)
#include <tmmintrin.h>
Instruction: pabsb xmm, xmm
CPUID Flags: SSSE3
#include <tmmintrin.h>
Instruction: pabsb xmm, xmm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.5 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
vpabsb
__m128i _mm_mask_abs_epi8 (__m128i src, __mmask16 k, __m128i a)
Synopsis
__m128i _mm_mask_abs_epi8 (__m128i src, __mmask16 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpabsb
__m128i _mm_maskz_abs_epi8 (__mmask16 k, __m128i a)
Synopsis
__m128i _mm_maskz_abs_epi8 (__mmask16 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpabsb
__m256i _mm256_abs_epi8 (__m256i a)
Synopsis
__m256i _mm256_abs_epi8 (__m256i a)
#include <immintrin.h>
Instruction: vpabsb ymm, ymm
CPUID Flags: AVX2
#include <immintrin.h>
Instruction: vpabsb ymm, ymm
CPUID Flags: AVX2
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
dst[MAX:256] := 0
vpabsb
__m256i _mm256_mask_abs_epi8 (__m256i src, __mmask32 k, __m256i a)
Synopsis
__m256i _mm256_mask_abs_epi8 (__m256i src, __mmask32 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpabsb
__m256i _mm256_maskz_abs_epi8 (__mmask32 k, __m256i a)
Synopsis
__m256i _mm256_maskz_abs_epi8 (__mmask32 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpabsb
__m512i _mm512_abs_epi8 (__m512i a)
Synopsis
__m512i _mm512_abs_epi8 (__m512i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
dst[MAX:512] := 0
vpabsb
__m512i _mm512_mask_abs_epi8 (__m512i src, __mmask64 k, __m512i a)
Synopsis
__m512i _mm512_mask_abs_epi8 (__m512i src, __mmask64 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpabsb
__m512i _mm512_maskz_abs_epi8 (__mmask64 k, __m512i a)
Synopsis
__m512i _mm512_maskz_abs_epi8 (__mmask64 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpabsb
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpandq
__m512d _mm512_abs_pd (__m512d v2)
Synopsis
__m512d _mm512_abs_pd (__m512d v2)
#include <immintrin.h>
Instruction: vpandq zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vpandq zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ABS(v2[i+63:i])
ENDFOR
dst[MAX:512] := 0
vpandq
__m512d _mm512_mask_abs_pd (__m512d src, __mmask8 k, __m512d v2)
Synopsis
__m512d _mm512_mask_abs_pd (__m512d src, __mmask8 k, __m512d v2)
#include <immintrin.h>
Instruction: vpandq zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vpandq zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ABS(v2[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
pabsw
__m64 _mm_abs_pi16 (__m64 a)
Synopsis
__m64 _mm_abs_pi16 (__m64 a)
#include <tmmintrin.h>
Instruction: pabsw mm, mm
CPUID Flags: SSSE3
#include <tmmintrin.h>
Instruction: pabsw mm, mm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 3
i := j*16
dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.5 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
pabsd
__m64 _mm_abs_pi32 (__m64 a)
Synopsis
__m64 _mm_abs_pi32 (__m64 a)
#include <tmmintrin.h>
Instruction: pabsd mm, mm
CPUID Flags: SSSE3
#include <tmmintrin.h>
Instruction: pabsd mm, mm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 1
i := j*32
dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.5 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
pabsb
__m64 _mm_abs_pi8 (__m64 a)
Synopsis
__m64 _mm_abs_pi8 (__m64 a)
#include <tmmintrin.h>
Instruction: pabsb mm, mm
CPUID Flags: SSSE3
#include <tmmintrin.h>
Instruction: pabsb mm, mm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 7
i := j*8
dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.5 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
vpandd
__m512 _mm512_abs_ps (__m512 v2)
Synopsis
__m512 _mm512_abs_ps (__m512 v2)
#include <immintrin.h>
Instruction: vpandd zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vpandd zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ABS(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpandd
__m512 _mm512_mask_abs_ps (__m512 src, __mmask16 k, __m512 v2)
Synopsis
__m512 _mm512_mask_abs_ps (__m512 src, __mmask16 k, __m512 v2)
#include <immintrin.h>
Instruction: vpandd zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vpandd zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ABS(v2[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_acos_pd (__m128d a)
Synopsis
__m128d _mm_acos_pd (__m128d a)
#include <immintrin.h>
CPUID Flags: SSE
#include <immintrin.h>
CPUID Flags: SSE
Description
Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_acos_pd (__m256d a)
Synopsis
__m256d _mm256_acos_pd (__m256d a)
#include <immintrin.h>
CPUID Flags: AVX
#include <immintrin.h>
CPUID Flags: AVX
Description
Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_acos_pd (__m512d a)
Synopsis
__m512d _mm512_acos_pd (__m512d a)
#include <immintrin.h>
CPUID Flags: AVX512F
#include <immintrin.h>
CPUID Flags: AVX512F
Description
Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_acos_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_acos_pd (__m512d src, __mmask8 k, __m512d a)
#include <immintrin.h>
CPUID Flags: AVX512F
#include <immintrin.h>
CPUID Flags: AVX512F
Description
Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ACOS(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_acos_ps (__m128 a)
Synopsis
__m128 _mm_acos_ps (__m128 a)
#include <immintrin.h>
CPUID Flags: SSE
#include <immintrin.h>
CPUID Flags: SSE
Description
Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_acos_ps (__m256 a)
Synopsis
__m256 _mm256_acos_ps (__m256 a)
#include <immintrin.h>
CPUID Flags: AVX
#include <immintrin.h>
CPUID Flags: AVX
Description
Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_acos_ps (__m512 a)
Synopsis
__m512 _mm512_acos_ps (__m512 a)
#include <immintrin.h>
CPUID Flags: AVX512F
#include <immintrin.h>
CPUID Flags: AVX512F
Description
Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_acos_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_acos_ps (__m512 src, __mmask16 k, __m512 a)
#include <immintrin.h>
CPUID Flags: AVX512F
#include <immintrin.h>
CPUID Flags: AVX512F
Description
Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ACOS(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_acosh_pd (__m128d a)
Synopsis
__m128d _mm_acosh_pd (__m128d a)
#include <immintrin.h>
CPUID Flags: SSE
#include <immintrin.h>
CPUID Flags: SSE
Description
Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_acosh_pd (__m256d a)
Synopsis
__m256d _mm256_acosh_pd (__m256d a)
#include <immintrin.h>
CPUID Flags: AVX
#include <immintrin.h>
CPUID Flags: AVX
Description
Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_acosh_pd (__m512d a)
Synopsis
__m512d _mm512_acosh_pd (__m512d a)
#include <immintrin.h>
CPUID Flags: AVX512F
#include <immintrin.h>
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_acosh_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_acosh_pd (__m512d src, __mmask8 k, __m512d a)
#include <immintrin.h>
CPUID Flags: AVX512F
#include <immintrin.h>
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ACOSH(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_acosh_ps (__m128 a)
Synopsis
__m128 _mm_acosh_ps (__m128 a)
#include <immintrin.h>
CPUID Flags: SSE
#include <immintrin.h>
CPUID Flags: SSE
Description
Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_acosh_ps (__m256 a)
Synopsis
__m256 _mm256_acosh_ps (__m256 a)
#include <immintrin.h>
CPUID Flags: AVX
#include <immintrin.h>
CPUID Flags: AVX
Description
Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_acosh_ps (__m512 a)
Synopsis
__m512 _mm512_acosh_ps (__m512 a)
#include <immintrin.h>
CPUID Flags: AVX512F
#include <immintrin.h>
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_acosh_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_acosh_ps (__m512 src, __mmask16 k, __m512 a)
#include <immintrin.h>
CPUID Flags: AVX512F
#include <immintrin.h>
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ACOSH(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpadcd
__m512i _mm512_adc_epi32 (__m512i v2, __mmask16 k2, __m512i v3, __mmask16 * k2_res)
Synopsis
__m512i _mm512_adc_epi32 (__m512i v2, __mmask16 k2, __m512i v3, __mmask16 * k2_res)
#include <immintrin.h>
Instruction: vpadcd zmm {k}, k, zmm
CPUID Flags: KNCNI
#include <immintrin.h>
Instruction: vpadcd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition of packed 32-bit integers in v2 and v3 and the corresponding bit in k2, storing the result of the addition in dst and the result of the carry in k2_res.
Operation
FOR j := 0 to 15
i := j*32
k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j])
dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j]
ENDFOR
dst[MAX:512] := 0
vpadcd
__m512i _mm512_mask_adc_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * k2_res)
Synopsis
__m512i _mm512_mask_adc_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * k2_res)
#include <immintrin.h>
Instruction: vpadcd zmm {k}, k, zmm
CPUID Flags: KNCNI
#include <immintrin.h>
Instruction: vpadcd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition of packed 32-bit integers in v2 and v3 and the corresponding bit in k2, storing the result of the addition in dst and the result of the carry in k2_res using writemask k1 (elements are copied from v2 when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j])
dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j]
ELSE
dst[i+31:i] := v2[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
paddw
__m128i _mm_add_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_add_epi16 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddw xmm, xmm
CPUID Flags: SSE2
#include <emmintrin.h>
Instruction: paddw xmm, xmm
CPUID Flags: SSE2
Description
Add packed 16-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.33 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
vpaddw
__m128i _mm_mask_add_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_add_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddw
__m128i _mm_maskz_add_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_add_epi16 (__mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddw
__m256i _mm256_add_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_add_epi16 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddw ymm, ymm, ymm
CPUID Flags: AVX2
#include <immintrin.h>
Instruction: vpaddw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 16-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
dst[MAX:256] := 0
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.33 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
vpaddw
__m256i _mm256_mask_add_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_add_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddw
__m256i _mm256_maskz_add_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_add_epi16 (__mmask16 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddw
__m512i _mm512_add_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_add_epi16 (__m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
dst[MAX:512] := 0
vpaddw
__m512i _mm512_mask_add_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_add_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddw
__m512i _mm512_maskz_add_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_add_epi16 (__mmask32 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpaddw
CPUID Flags: AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
paddd
__m128i _mm_add_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_add_epi32 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddd xmm, xmm
CPUID Flags: SSE2
#include <emmintrin.h>
Instruction: paddd xmm, xmm
CPUID Flags: SSE2
Description
Add packed 32-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.33 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
vpaddd
__m128i _mm_mask_add_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_add_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddd
__m128i _mm_maskz_add_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_add_epi32 (__mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddd
__m256i _mm256_add_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_add_epi32 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddd ymm, ymm, ymm
CPUID Flags: AVX2
#include <immintrin.h>
Instruction: vpaddd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 32-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.33 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
vpaddd
__m256i _mm256_mask_add_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_add_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddd
__m256i _mm256_maskz_add_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_add_epi32 (__mmask8 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddd
__m512i _mm512_add_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_add_epi32 (__m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed 32-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vpaddd
__m512i _mm512_mask_add_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_add_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddd
__m512i _mm512_maskz_add_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_add_epi32 (__mmask16 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
paddq
__m128i _mm_add_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_add_epi64 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddq xmm, xmm
CPUID Flags: SSE2
#include <emmintrin.h>
Instruction: paddq xmm, xmm
CPUID Flags: SSE2
Description
Add packed 64-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.33 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
vpaddq
__m128i _mm_mask_add_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_add_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddq
__m128i _mm_maskz_add_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_add_epi64 (__mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddq
__m256i _mm256_add_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_add_epi64 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddq ymm, ymm, ymm
CPUID Flags: AVX2
#include <immintrin.h>
Instruction: vpaddq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 64-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.33 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
vpaddq
__m256i _mm256_mask_add_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_add_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddq
__m256i _mm256_maskz_add_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_add_epi64 (__mmask8 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F
#include <immintrin.h>
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddq
__m512i _mm512_add_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_add_epi64 (__m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vpaddq
__m512i _mm512_mask_add_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_add_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddq
__m512i _mm512_maskz_add_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_add_epi64 (__mmask8 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
paddb
__m128i _mm_add_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_add_epi8 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddb xmm, xmm
CPUID Flags: SSE2
#include <emmintrin.h>
Instruction: paddb xmm, xmm
CPUID Flags: SSE2
Description
Add packed 8-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.33 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
vpaddb
__m128i _mm_mask_add_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_add_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddb
__m128i _mm_maskz_add_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_add_epi8 (__mmask16 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddb
__m256i _mm256_add_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_add_epi8 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddb ymm, ymm, ymm
CPUID Flags: AVX2
#include <immintrin.h>
Instruction: vpaddb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 8-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
dst[MAX:256] := 0
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.33 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
vpaddb
__m256i _mm256_mask_add_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_add_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddb
__m256i _mm256_maskz_add_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_add_epi8 (__mmask32 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddb
__m512i _mm512_add_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_add_epi8 (__m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
dst[MAX:512] := 0
vpaddb
__m512i _mm512_mask_add_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_add_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddb
__m512i _mm512_maskz_add_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_add_epi8 (__mmask64 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpaddb
CPUID Flags: AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
addpd
__m128d _mm_add_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_add_pd (__m128d a, __m128d b)
#include <emmintrin.h>
Instruction: addpd xmm, xmm
CPUID Flags: SSE2
#include <emmintrin.h>
Instruction: addpd xmm, xmm
CPUID Flags: SSE2
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 4 | 0.5 |
Broadwell | 3 | 1 |
Haswell | 3 | 1 |
Ivy Bridge | 3 | 1 |
vaddpd
__m128d _mm_mask_add_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_add_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include <immintrin.h>
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL
#include <immintrin.h>
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vaddpd
__m128d _mm_maskz_add_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_add_pd (__mmask8 k, __m128d a, __m128d b)
#include <immintrin.h>
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL
#include <immintrin.h>
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vaddpd
__m256d _mm256_add_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_add_pd (__m256d a, __m256d b)
#include <immintrin.h>
Instruction: vaddpd ymm, ymm, ymm
CPUID Flags: AVX
#include <immintrin.h>
Instruction: vaddpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 4 | 0.5 |
Broadwell | 3 | 1 |
Haswell | 3 | 1 |
Ivy Bridge | 3 | 1 |
vaddpd
__m256d _mm256_mask_add_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_add_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include <immintrin.h>
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL
#include <immintrin.h>
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vaddpd
__m256d _mm256_maskz_add_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_add_pd (__mmask8 k, __m256d a, __m256d b)
#include <immintrin.h>
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL
#include <immintrin.h>
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vaddpd
__m512d _mm512_add_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_add_pd (__m512d a, __m512d b)
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_mask_add_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_add_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_maskz_add_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_add_pd (__mmask8 k, __m512d a, __m512d b)
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
paddw
__m64 _mm_add_pi16 (__m64 a, __m64 b)
Synopsis
__m64 _mm_add_pi16 (__m64 a, __m64 b)
#include <mmintrin.h>
Instruction: paddw mm, mm
CPUID Flags: MMX
#include <mmintrin.h>
Instruction: paddw mm, mm
CPUID Flags: MMX
Description
Add packed 16-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*16
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
paddd
__m64 _mm_add_pi32 (__m64 a, __m64 b)
Synopsis
__m64 _mm_add_pi32 (__m64 a, __m64 b)
#include <mmintrin.h>
Instruction: paddd mm, mm
CPUID Flags: MMX
#include <mmintrin.h>
Instruction: paddd mm, mm
CPUID Flags: MMX
Description
Add packed 32-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
paddb
__m64 _mm_add_pi8 (__m64 a, __m64 b)
Synopsis
__m64 _mm_add_pi8 (__m64 a, __m64 b)
#include <mmintrin.h>
Instruction: paddb mm, mm
CPUID Flags: MMX
#include <mmintrin.h>
Instruction: paddb mm, mm
CPUID Flags: MMX
Description
Add packed 8-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*8
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
addps
__m128 _mm_add_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_add_ps (__m128 a, __m128 b)
#include <xmmintrin.h>
Instruction: addps xmm, xmm
CPUID Flags: SSE
#include <xmmintrin.h>
Instruction: addps xmm, xmm
CPUID Flags: SSE
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 4 | 0.5 |
Broadwell | 3 | 1 |
Haswell | 3 | 1 |
Ivy Bridge | 3 | 1 |
vaddps
__m128 _mm_mask_add_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_add_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include <immintrin.h>
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL
#include <immintrin.h>
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vaddps
__m128 _mm_maskz_add_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_add_ps (__mmask8 k, __m128 a, __m128 b)
#include <immintrin.h>
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL
#include <immintrin.h>
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vaddps
__m256 _mm256_add_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_add_ps (__m256 a, __m256 b)
#include <immintrin.h>
Instruction: vaddps ymm, ymm, ymm
CPUID Flags: AVX
#include <immintrin.h>
Instruction: vaddps ymm, ymm, ymm
CPUID Flags: AVX
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 4 | 0.5 |
Broadwell | 3 | 1 |
Haswell | 3 | 1 |
Ivy Bridge | 3 | 1 |
vaddps
__m256 _mm256_mask_add_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_add_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include <immintrin.h>
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL
#include <immintrin.h>
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vaddps
__m256 _mm256_maskz_add_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_add_ps (__mmask8 k, __m256 a, __m256 b)
#include <immintrin.h>
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL
#include <immintrin.h>
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vaddps
__m512 _mm512_add_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_add_ps (__m512 a, __m512 b)
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_mask_add_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_add_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_maskz_add_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_add_ps (__mmask16 k, __m512 a, __m512 b)
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_add_round_pd (__m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_add_round_pd (__m512d a, __m512d b, int rounding)
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_mask_add_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_mask_add_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_maskz_add_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_maskz_add_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_add_round_ps (__m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_add_round_ps (__m512 a, __m512 b, int rounding)
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_mask_add_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_mask_add_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_maskz_add_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_maskz_add_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vaddsd
__m128d _mm_add_round_sd (__m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_add_round_sd (__m128d a, __m128d b, int rounding)
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := a[63:0] + b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vaddsd
__m128d _mm_mask_add_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_mask_add_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := a[63:0] + b[63:0]
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vaddsd
__m128d _mm_maskz_add_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_maskz_add_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := a[63:0] + b[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vaddss
__m128 _mm_add_round_ss (__m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_add_round_ss (__m128 a, __m128 b, int rounding)
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := a[31:0] + b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vaddss
__m128 _mm_mask_add_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_mask_add_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := a[31:0] + b[31:0]
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vaddss
__m128 _mm_maskz_add_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_maskz_add_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := a[31:0] + b[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
addsd
__m128d _mm_add_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_add_sd (__m128d a, __m128d b)
#include <emmintrin.h>
Instruction: addsd xmm, xmm
CPUID Flags: SSE2
#include <emmintrin.h>
Instruction: addsd xmm, xmm
CPUID Flags: SSE2
Description
Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := a[63:0] + b[63:0]
dst[127:64] := a[127:64]
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 4 | 0.5 |
Broadwell | 3 | 1 |
Haswell | 3 | 1 |
Ivy Bridge | 3 | 1 |
vaddsd
__m128d _mm_mask_add_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_add_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := a[63:0] + b[63:0]
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vaddsd
__m128d _mm_maskz_add_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_add_sd (__mmask8 k, __m128d a, __m128d b)
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := a[63:0] + b[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
paddq
__m64 _mm_add_si64 (__m64 a, __m64 b)
Synopsis
__m64 _mm_add_si64 (__m64 a, __m64 b)
#include <emmintrin.h>
Instruction: paddq mm, mm
CPUID Flags: SSE2
#include <emmintrin.h>
Instruction: paddq mm, mm
CPUID Flags: SSE2
Description
Add 64-bit integers a and b, and store the result in dst.
Operation
dst[63:0] := a[63:0] + b[63:0]
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.33 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
addss
__m128 _mm_add_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_add_ss (__m128 a, __m128 b)
#include <xmmintrin.h>
Instruction: addss xmm, xmm
CPUID Flags: SSE
#include <xmmintrin.h>
Instruction: addss xmm, xmm
CPUID Flags: SSE
Description
Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := a[31:0] + b[31:0]
dst[127:32] := a[127:32]
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 4 | 0.5 |
Broadwell | 3 | 1 |
Haswell | 3 | 1 |
Ivy Bridge | 3 | 1 |
vaddss
__m128 _mm_mask_add_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_add_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := a[31:0] + b[31:0]
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vaddss
__m128 _mm_maskz_add_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_add_ss (__mmask8 k, __m128 a, __m128 b)
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
#include <immintrin.h>
Instruction: vaddss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := a[31:0] + b[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
adc
unsigned char _addcarry_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)
Synopsis
unsigned char _addcarry_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)
#include <immintrin.h>
Instruction: adc r32, r32
#include <immintrin.h>
Instruction: adc r32, r32
Description
Add unsigned 32-bit integers a and b with unsigned 8-bit carry-in c_in (carry flag), and store the unsigned 32-bit result in out, and the carry-out in dst (carry flag).
Operation
out[31:0] := a[31:0] + b[31:0] + c_in
dst := carry_out
adc
unsigned char _addcarry_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
Synopsis
unsigned char _addcarry_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
#include <immintrin.h>
Instruction: adc r64, r64
#include <immintrin.h>
Instruction: adc r64, r64
Description
Add unsigned 64-bit integers a and b with unsigned 8-bit carry-in c_in (carry flag), and store the unsigned 64-bit result in out, and the carry-out in dst (carry flag).
Operation
out[63:0] := a[63:0] + b[63:0] + c_in
dst := carry_out
adcx, adox
unsigned char _addcarryx_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)
Synopsis
unsigned char _addcarryx_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)
#include <immintrin.h>
Instruction: adcx r32, r32
adox r32, r32
CPUID Flags: ADX
#include <immintrin.h>
Instruction: adcx r32, r32
adox r32, r32
CPUID Flags: ADX
Description
Add unsigned 32-bit integers a and b with unsigned 8-bit carry-in c_in (carry or overflow flag), and store the unsigned 32-bit result in out, and the carry-out in dst (carry or overflow flag).
Operation
out[31:0] := a[31:0] + b[31:0] + c_in
dst := carry_out
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 1 |
Broadwell | 1 | 1 |
adcx, adox
unsigned char _addcarryx_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
Synopsis
unsigned char _addcarryx_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
#include <immintrin.h>
Instruction: adcx r64, r64
adox r64, r64
CPUID Flags: ADX
#include <immintrin.h>
Instruction: adcx r64, r64
adox r64, r64
CPUID Flags: ADX
Description
Add unsigned 64-bit integers a and b with unsigned 8-bit carry-in c_in (carry or overflow flag), and store the unsigned 64-bit result in out, and the carry-out in dst (carry or overflow flag).
Operation
out[63:0] := a[63:0] + b[63:0] + c_in
dst := carry_out
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 1 |
Broadwell | 1 | 1 |
vaddnpd
__m512d _mm512_addn_pd (__m512d v2, __m512d v3)
Synopsis
__m512d _mm512_addn_pd (__m512d v2, __m512d v3)
#include <immintrin.h>
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
#include <immintrin.h>
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
ENDFOR
dst[MAX:512] := 0
vaddnpd
__m512d _mm512_mask_addn_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3)
Synopsis
__m512d _mm512_mask_addn_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3)
#include <immintrin.h>
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
#include <immintrin.h>
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddnps
__m512 _mm512_addn_ps (__m512 v2, __m512 v3)
Synopsis
__m512 _mm512_addn_ps (__m512 v2, __m512 v3)
#include <immintrin.h>
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
#include <immintrin.h>
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
ENDFOR
dst[MAX:512] := 0
vaddnps
__m512 _mm512_mask_addn_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3)
Synopsis
__m512 _mm512_mask_addn_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3)
#include <immintrin.h>
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
#include <immintrin.h>
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddnpd
__m512d _mm512_addn_round_pd (__m512d v2, __m512d v3, int rounding)
Synopsis
__m512d _mm512_addn_round_pd (__m512d v2, __m512d v3, int rounding)
#include <immintrin.h>
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
#include <immintrin.h>
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element by element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates the sum, storing the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
ENDFOR
dst[MAX:512] := 0
vaddnpd
__m512d _mm512_mask_addn_round_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3, int rounding)
Synopsis
__m512d _mm512_mask_addn_round_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3, int rounding)
#include <immintrin.h>
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
#include <immintrin.h>
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element by element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates the sum, storing the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddnps
__m512 _mm512_addn_round_ps (__m512 v2, __m512 v3, int rounding)
Synopsis
__m512 _mm512_addn_round_ps (__m512 v2, __m512 v3, int rounding)
#include <immintrin.h>
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
#include <immintrin.h>
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element by element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates the sum, storing the result in dst.
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
ENDFOR
dst[MAX:512] := 0
vaddnps
__m512 _mm512_mask_addn_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, int rounding)
Synopsis
__m512 _mm512_mask_addn_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, int rounding)
#include <immintrin.h>
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
#include <immintrin.h>
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element by element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates the sum, storing the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
paddsw
__m128i _mm_adds_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_adds_epi16 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddsw xmm, xmm
CPUID Flags: SSE2
#include <emmintrin.h>
Instruction: paddsw xmm, xmm
CPUID Flags: SSE2
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.5 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
vpaddsw
__m128i _mm_mask_adds_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_adds_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddsw
__m128i _mm_maskz_adds_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_adds_epi16 (__mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddsw
__m256i _mm256_adds_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_adds_epi16 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddsw ymm, ymm, ymm
CPUID Flags: AVX2
#include <immintrin.h>
Instruction: vpaddsw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:256] := 0
vpaddsw
__m256i _mm256_mask_adds_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_adds_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddsw
__m256i _mm256_maskz_adds_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_adds_epi16 (__mmask16 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddsw
__m512i _mm512_adds_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_adds_epi16 (__m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:512] := 0
vpaddsw
__m512i _mm512_mask_adds_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_adds_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddsw
__m512i _mm512_maskz_adds_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_adds_epi16 (__mmask32 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpaddsw
CPUID Flags: AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
paddsb
__m128i _mm_adds_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_adds_epi8 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddsb xmm, xmm
CPUID Flags: SSE2
#include <emmintrin.h>
Instruction: paddsb xmm, xmm
CPUID Flags: SSE2
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.5 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
vpaddsb
__m128i _mm_mask_adds_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_adds_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddsb
__m128i _mm_maskz_adds_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_adds_epi8 (__mmask16 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddsb
__m256i _mm256_adds_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_adds_epi8 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddsb ymm, ymm, ymm
CPUID Flags: AVX2
#include <immintrin.h>
Instruction: vpaddsb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:256] := 0
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.5 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
vpaddsb
__m256i _mm256_mask_adds_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_adds_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddsb
__m256i _mm256_maskz_adds_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_adds_epi8 (__mmask32 k, __m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddsb
__m512i _mm512_adds_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_adds_epi8 (__m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:512] := 0
vpaddsb
__m512i _mm512_mask_adds_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_adds_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddsb
__m512i _mm512_maskz_adds_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_adds_epi8 (__mmask64 k, __m512i a, __m512i b)
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512BW
#include <immintrin.h>
Instruction: vpaddsb
CPUID Flags: AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
paddusw
__m128i _mm_adds_epu16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_adds_epu16 (__m128i a, __m128i b)
#include <emmintrin.h>
Instruction: paddusw xmm, xmm
CPUID Flags: SSE2
#include <emmintrin.h>
Instruction: paddusw xmm, xmm
CPUID Flags: SSE2
Description
Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ENDFOR
Performance
Architecture | Latency | Throughput (CPI) |
---|---|---|
Skylake | 1 | 0.5 |
Broadwell | 1 | 0.5 |
Haswell | 1 | 0.5 |
Ivy Bridge | 1 | 0.5 |
vpaddusw
__m128i _mm_mask_adds_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_adds_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include <immintrin.h>
Instruction: vpaddusw
CPUID Flags: AVX512VL + AVX512BW
#include <immintrin.h>
Instruction: vpaddusw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0