添加链接
link之家
链接快照平台
  • 输入网页链接,自动生成快照
  • 标签化管理网页链接

重要参考: http://clang.llvm.org/doxygen/immintrin_8h.html

https://blog.csdn.net/fengbingchun/article/details/23598709?utm_source=blogxgwz0

SIMD相关头文件包括:

//#include <ivec.h>    //MMX
//#include <fvec.h>    //SSE(also include ivec.h)
//#include <dvec.h>    //SSE2(also include fvec.h)
#include <mmintrin.h>  //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h> //SSSE3(include pmmintrin.h)
#include <smmintrin.h> //SSE4.1(include tmmintrin.h)
#include <nmmintrin.h> //SSE4.2(include smmintrin.h)
#include <wmmintrin.h> //AES(include nmmintrin.h)
#include <immintrin.h> //AVX(include wmmintrin.h)
#include <intrin.h>    //(include immintrin.h)

mmintrin.h为MMX 头文件,其中__m64的定义为:

typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64 {
    unsigned __int64    m64_u64;
    float               m64_f32[2];
    __int8              m64_i8[8];
    __int16             m64_i16[4];
    __int32             m64_i32[2];    
    __int64             m64_i64;
    unsigned __int8     m64_u8[8];
    unsigned __int16    m64_u16[4];
    unsigned __int32    m64_u32[2];
} __m64;

xmmintrin.h为SSE 头文件,此头文件里包含MMX头文件,其中__m128的定义为:

typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
     float               m128_f32[4];
     unsigned __int64    m128_u64[2];
     __int8              m128_i8[16];
     __int16             m128_i16[8];
     __int32             m128_i32[4];
     __int64             m128_i64[2];
     unsigned __int8     m128_u8[16];
     unsigned __int16    m128_u16[8];
     unsigned __int32    m128_u32[4];
 } __m128;

emmintrin.h为SSE2头文件,此头文件里包含SSE头文件,其中__m128i 和__m128d 的定义为:

typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
    __int8              m128i_i8[16];
    __int16             m128i_i16[8];
    __int32             m128i_i32[4];    
    __int64             m128i_i64[2];
    unsigned __int8     m128i_u8[16];
    unsigned __int16    m128i_u16[8];
    unsigned __int32    m128i_u32[4];
    unsigned __int64    m128i_u64[2];
} __m128i;
typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d {
    double              m128d_f64[2];
} __m128d;

immintrin.h为AVX头文件,此头文件里包含AES头文件,其中__m256、__m256d、__m256i的定义为:

typedef union __declspec(intrin_type) _CRT_ALIGN(32) __m256 { 
    float m256_f32[8];
} __m256;
typedef struct __declspec(intrin_type) _CRT_ALIGN(32) {
    double m256d_f64[4]; 
} __m256d; 
typedef union  __declspec(intrin_type) _CRT_ALIGN(32) __m256i {
    __int8              m256i_i8[32];
    __int16             m256i_i16[16];
    __int32             m256i_i32[8];
    __int64             m256i_i64[4];
    unsigned __int8     m256i_u8[32];
    unsigned __int16    m256i_u16[16];
    unsigned __int32    m256i_u32[8];
    unsigned __int64    m256i_u64[4];
} __m256i;

immintrin.h文件中各函数的介绍:

* Add Packed Double Precision Floating-Point Values * **** VADDPD ymm1, ymm2, ymm3/m256 * Performs an SIMD add of the four packed double-precision floating-point * values from the first source operand to the second source operand, and * stores the packed double-precision floating-point results in the * destination //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r0=m10+m20, r1=m11+m21, r2=m12+m22, r3=m13+m23 extern __m256d __cdecl _mm256_add_pd(__m256d m1, __m256d m2); * Add Packed Single Precision Floating-Point Values * **** VADDPS ymm1, ymm2, ymm3/m256 * Performs an SIMD add of the eight packed single-precision floating-point * values from the first source operand to the second source operand, and * stores the packed single-precision floating-point results in the * destination //m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27) //则r0=m10+m20, r1=m11+m21, ..., r7=m17+m27 extern __m256 __cdecl _mm256_add_ps(__m256 m1, __m256 m2); * Add/Subtract Double Precision Floating-Point Values * **** VADDSUBPD ymm1, ymm2, ymm3/m256 * Adds odd-numbered double-precision floating-point values of the first * source operand with the corresponding double-precision floating-point * values from the second source operand; stores the result in the odd-numbered * values of the destination. 
Subtracts the even-numbered double-precision * floating-point values from the second source operand from the corresponding * double-precision floating values in the first source operand; stores the * result into the even-numbered values of the destination //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r0=m10-m20, r1=m11+m21, r2=m12-m22, r3=m13+m23 extern __m256d __cdecl _mm256_addsub_pd(__m256d m1, __m256d m2); * Add/Subtract Packed Single Precision Floating-Point Values * **** VADDSUBPS ymm1, ymm2, ymm3/m256 * Adds odd-numbered single-precision floating-point values of the first source * operand with the corresponding single-precision floating-point values from * the second source operand; stores the result in the odd-numbered values of * the destination. Subtracts the even-numbered single-precision floating-point * values from the second source operand from the corresponding * single-precision floating values in the first source operand; stores the * result into the even-numbered values of the destination //m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27) //则r0=m10-m20, r1=m11+m21, ... 
, r6=m16-m26, r7=m17+m27 extern __m256 __cdecl _mm256_addsub_ps(__m256 m1, __m256 m2); * Bitwise Logical AND of Packed Double Precision Floating-Point Values * **** VANDPD ymm1, ymm2, ymm3/m256 * Performs a bitwise logical AND of the four packed double-precision * floating-point values from the first source operand and the second * source operand, and stores the result in the destination //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r0=(m10 & m20), r1=(m11 & m21), r2=(m12 & m22), r3=(m13 & m23) extern __m256d __cdecl _mm256_and_pd(__m256d m1, __m256d m2); * Bitwise Logical AND of Packed Single Precision Floating-Point Values * **** VANDPS ymm1, ymm2, ymm3/m256 * Performs a bitwise logical AND of the eight packed single-precision * floating-point values from the first source operand and the second * source operand, and stores the result in the destination //m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27) //则r0=(m10 & m20), r1=(m11 & m21), ..., r6=(m16 & m26), r7=(m17 & m27) extern __m256 __cdecl _mm256_and_ps(__m256 m1, __m256 m2); * Bitwise Logical AND NOT of Packed Double Precision Floating-Point Values * **** VANDNPD ymm1, ymm2, ymm3/m256 * Performs a bitwise logical AND NOT of the four packed double-precision * floating-point values from the first source operand and the second source * operand, and stores the result in the destination //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r0=(~m10) & m20, r1=(~m11) & m21, r2=(~m12) & m22, r3=(~m13) & m23 extern __m256d __cdecl _mm256_andnot_pd(__m256d m1, __m256d m2); * Bitwise Logical AND NOT of Packed Single Precision Floating-Point Values * **** VANDNPS ymm1, ymm2, ymm3/m256 * Performs a bitwise logical AND NOT of the eight packed single-precision * floating-point values from the first source operand and the second source * operand, and stores the result in the destination //m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27) //则r0=(~m10) & m20, r1=(~m11) & m21, 
..., r6=(~m16) & m26, r7=(~m17) & m27 extern __m256 __cdecl _mm256_andnot_ps(__m256 m1, __m256 m2); * Blend Packed Double Precision Floating-Point Values * **** VBLENDPD ymm1, ymm2, ymm3/m256, imm8 * Double-Precision Floating-Point values from the second source operand are * conditionally merged with values from the first source operand and written * to the destination. The immediate bits [3:0] determine whether the * corresponding Double-Precision Floating Point value in the destination is * copied from the second source or first source. If a bit in the mask, * corresponding to a word, is "1", then the Double-Precision Floating-Point * value in the second source operand is copied, else the value in the first * source operand is copied //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23), mask=[b3 b2 b1 b0] //如果bn=1,则rn=m2n,如果bn=0, 则rn=m1n, 其中n为0,1,2,3 extern __m256d __cdecl _mm256_blend_pd(__m256d m1, __m256d m2, const int mask); * Blend Packed Single Precision Floating-Point Values * **** VBLENDPS ymm1, ymm2, ymm3/m256, imm8 * Single precision floating point values from the second source operand are * conditionally merged with values from the first source operand and written * to the destination. The immediate bits [7:0] determine whether the * corresponding single precision floating-point value in the destination is * copied from the second source or first source. 
If a bit in the mask, * corresponding to a word, is "1", then the single-precision floating-point * value in the second source operand is copied, else the value in the first * source operand is copied //m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27),mask=[b7 b6...b1 b0] //如果bn=1,则rn=m2n,如果bn=0, 则rn=m1n, 其中n为0,1,2,3,4,5,6,7 extern __m256 __cdecl _mm256_blend_ps(__m256 m1, __m256 m2, const int mask); * Blend Packed Double Precision Floating-Point Values * **** VBLENDVPD ymm1, ymm2, ymm3/m256, ymm4 * Conditionally copy each quadword data element of double-precision * floating-point value from the second source operand (third operand) and the * first source operand (second operand) depending on mask bits defined in the * mask register operand (fourth operand). extern __m256d __cdecl _mm256_blendv_pd(__m256d m1, __m256d m2, __m256d m3); * Blend Packed Single Precision Floating-Point Values * **** VBLENDVPS ymm1, ymm2, ymm3/m256, ymm4 * Conditionally copy each dword data element of single-precision * floating-point value from the second source operand (third operand) and the * first source operand (second operand) depending on mask bits defined in the * mask register operand (fourth operand). 
extern __m256 __cdecl _mm256_blendv_ps(__m256 m1, __m256 m2, __m256 mask); * Divide Packed Double-Precision Floating-Point Values * **** VDIVPD ymm1, ymm2, ymm3/m256 * Performs an SIMD divide of the four packed double-precision floating-point * values in the first source operand by the four packed double-precision * floating-point values in the second source operand //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r0=m10/m20, r1=m11/m21, r2=m12/m22, r3=m13/m23 extern __m256d __cdecl _mm256_div_pd(__m256d m1, __m256d m2); * Divide Packed Single-Precision Floating-Point Values * **** VDIVPS ymm1, ymm2, ymm3/m256 * Performs an SIMD divide of the eight packed single-precision * floating-point values in the first source operand by the eight packed * single-precision floating-point values in the second source operand //m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27) //则r0=m10/m20, r1=m11/m21, ..., r6=m16/m26, r7=m17/m27 extern __m256 __cdecl _mm256_div_ps(__m256 m1, __m256 m2); * Dot Product of Packed Single-Precision Floating-Point Values * **** VDPPS ymm1, ymm2, ymm3/m256, imm8 * Multiplies the packed single precision floating point values in the * first source operand with the packed single-precision floats in the * second source. Each of the four resulting single-precision values is * conditionally summed depending on a mask extracted from the high 4 bits * of the immediate operand. This sum is broadcast to each of 4 positions * in the destination if the corresponding bit of the mask selected from * the low 4 bits of the immediate operand is "1". If the corresponding * low bit 0-3 of the mask is zero, the destination is set to zero. * The process is replicated for the high elements of the destination. //m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27) //mask=[b7 b6 ... 
b0], mask的低四位决定结果值是0,还是m1和m2相应位相乘后再求和 //若b0b1b2b3为0001,则r0=r1=r2=0,r4=r5=r6=0,此时如果b4b5b6b7为1001, //则r3=m10*m20+m13*m23, r7=m14*m24+m17*m27,其它依次类推 extern __m256 __cdecl _mm256_dp_ps(__m256 m1, __m256 m2, const int mask); * Add Horizontal Double Precision Floating-Point Values * **** VHADDPD ymm1, ymm2, ymm3/m256 * Adds pairs of adjacent double-precision floating-point values in the * first source operand and second source operand and stores results in * the destination //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r0=m10+m11, r1=m20+m21, r2=m12+m13, r3=m22+m23 extern __m256d __cdecl _mm256_hadd_pd(__m256d m1, __m256d m2); * Add Horizontal Single Precision Floating-Point Values * **** VHADDPS ymm1, ymm2, ymm3/m256 * Adds pairs of adjacent single-precision floating-point values in the * first source operand and second source operand and stores results in * the destination //m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27) //则r0=m10+m11, r1=m12+m13, r2=m20+m21, r3=m22+m23, //r4=m14+m15, r5=m16+m17, r6=m24+m25, r7=m26+m27 extern __m256 __cdecl _mm256_hadd_ps(__m256 m1, __m256 m2); * Subtract Horizontal Double Precision Floating-Point Values * **** VHSUBPD ymm1, ymm2, ymm3/m256 * Subtract pairs of adjacent double-precision floating-point values in * the first source operand and second source operand and stores results * in the destination //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r0=m10-m11, r1=m20-m21, r2=m12-m13, r3=m22-m23 extern __m256d __cdecl _mm256_hsub_pd(__m256d m1, __m256d m2); * Subtract Horizontal Single Precision Floating-Point Values * **** VHSUBPS ymm1, ymm2, ymm3/m256 * Subtract pairs of adjacent single-precision floating-point values in * the first source operand and second source operand and stores results * in the destination. 
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27) //则r0=m10-m11, r1=m12-m13, r2=m20-m21, r3=m22-m23, //r4=m14-m15, r5=m16-m17, r6=m24-m25, r7=m26-m27 extern __m256 __cdecl _mm256_hsub_ps(__m256 m1, __m256 m2); * Maximum of Packed Double Precision Floating-Point Values * **** VMAXPD ymm1, ymm2, ymm3/m256 * Performs an SIMD compare of the packed double-precision floating-point * values in the first source operand and the second source operand and * returns the maximum value for each pair of values to the destination //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r0=max(m10,m20), r1=max(m11,m21), r2=max(m12,m22), r3=max(m13,m23) extern __m256d __cdecl _mm256_max_pd(__m256d m1, __m256d m2); * Maximum of Packed Single Precision Floating-Point Values * **** VMAXPS ymm1, ymm2, ymm3/m256 * Performs an SIMD compare of the packed single-precision floating-point * values in the first source operand and the second source operand and * returns the maximum value for each pair of values to the destination //m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27) //则r0=max(m10,m20), r1=max(m11,m21), ..., r6=max(m16,m26), r7=max(m17,m27) extern __m256 __cdecl _mm256_max_ps(__m256 m1, __m256 m2); * Minimum of Packed Double Precision Floating-Point Values * **** VMINPD ymm1, ymm2, ymm3/m256 * Performs an SIMD compare of the packed double-precision floating-point * values in the first source operand and the second source operand and * returns the minimum value for each pair of values to the destination //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r0=min(m10,m20), r1=min(m11,m21), r2=min(m12,m22), r3=min(m13,m23) extern __m256d __cdecl _mm256_min_pd(__m256d m1, __m256d m2); * Minimum of Packed Single Precision Floating-Point Values * **** VMINPS ymm1, ymm2, ymm3/m256 * Performs an SIMD compare of the packed single-precision floating-point * values in the first source operand and the second source operand and * returns the minimum value for each pair of values to the 
destination //m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27) //则r0=min(m10,m20), r1=min(m11,m21), ..., r6=min(m16,m26), r7=min(m17,m27) extern __m256 __cdecl _mm256_min_ps(__m256 m1, __m256 m2); * Multiply Packed Double Precision Floating-Point Values * **** VMULPD ymm1, ymm2, ymm3/m256 * Performs a SIMD multiply of the four packed double-precision floating-point * values from the first Source operand to the Second Source operand, and * stores the packed double-precision floating-point results in the * destination //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r0=m10*m20, r1=m11*m21, r2=m12*m22, r3=m13*m23 extern __m256d __cdecl _mm256_mul_pd(__m256d m1, __m256d m2); * Multiply Packed Single Precision Floating-Point Values * **** VMULPS ymm1, ymm2, ymm3/m256 * Performs an SIMD multiply of the eight packed single-precision * floating-point values from the first source operand to the second source * operand, and stores the packed double-precision floating-point results in * the destination //m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27) //则r0=m10*m20, r1=m11*m21, ..., r6=m16*m26, r7=m17*m27 extern __m256 __cdecl _mm256_mul_ps(__m256 m1, __m256 m2); * Bitwise Logical OR of Packed Double Precision Floating-Point Values * **** VORPD ymm1, ymm2, ymm3/m256 * Performs a bitwise logical OR of the four packed double-precision * floating-point values from the first source operand and the second * source operand, and stores the result in the destination //注意:有时得到的结果并不是m1和m2按位或的结果? extern __m256d __cdecl _mm256_or_pd(__m256d m1, __m256d m2); * Bitwise Logical OR of Packed Single Precision Floating-Point Values * **** VORPS ymm1, ymm2, ymm3/m256 * Performs a bitwise logical OR of the eight packed single-precision * floating-point values from the first source operand and the second * source operand, and stores the result in the destination //注意:有时得到的结果并不是m1和m2按位或的结果? 
extern __m256 __cdecl _mm256_or_ps(__m256 m1, __m256 m2); * Shuffle Packed Double Precision Floating-Point Values * **** VSHUFPD ymm1, ymm2, ymm3/m256, imm8 * Moves either of the two packed double-precision floating-point values from * each double quadword in the first source operand into the low quadword * of each double quadword of the destination; moves either of the two packed * double-precision floating-point values from the second source operand into * the high quadword of each double quadword of the destination operand. * The selector operand determines which values are moved to the destination extern __m256d __cdecl _mm256_shuffle_pd(__m256d m1, __m256d m2, const int select); * Shuffle Packed Single Precision Floating-Point Values * **** VSHUFPS ymm1, ymm2, ymm3/m256, imm8 * Moves two of the four packed single-precision floating-point values * from each double qword of the first source operand into the low * quadword of each double qword of the destination; moves two of the four * packed single-precision floating-point values from each double qword of * the second source operand into to the high quadword of each double qword * of the destination. The selector operand determines which values are moved * to the destination. 
extern __m256 __cdecl _mm256_shuffle_ps(__m256 m1, __m256 m2, const int select); * Subtract Packed Double Precision Floating-Point Values * **** VSUBPD ymm1, ymm2, ymm3/m256 * Performs an SIMD subtract of the four packed double-precision floating-point * values of the second Source operand from the first Source operand, and * stores the packed double-precision floating-point results in the destination //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r0=m10-m20, r1=m11-m21, r2=m12-m22, r3=m13-m23 extern __m256d __cdecl _mm256_sub_pd(__m256d m1, __m256d m2); * Subtract Packed Single Precision Floating-Point Values * **** VSUBPS ymm1, ymm2, ymm3/m256 * Performs an SIMD subtract of the eight packed single-precision * floating-point values in the second Source operand from the First Source * operand, and stores the packed single-precision floating-point results in * the destination //m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27) //则r0=m10-m20, r1=m11-m21, ..., r6=m16-m26, r7=m17-m27 extern __m256 __cdecl _mm256_sub_ps(__m256 m1, __m256 m2); * Bitwise Logical XOR of Packed Double Precision Floating-Point Values * **** VXORPD ymm1, ymm2, ymm3/m256 * Performs a bitwise logical XOR of the four packed double-precision * floating-point values from the first source operand and the second * source operand, and stores the result in the destination //注意:有时得到的结果并不是m1和m2按位异或的结果? extern __m256d __cdecl _mm256_xor_pd(__m256d m1, __m256d m2); * Bitwise Logical XOR of Packed Single Precision Floating-Point Values * **** VXORPS ymm1, ymm2, ymm3/m256 * Performs a bitwise logical XOR of the eight packed single-precision * floating-point values from the first source operand and the second * source operand, and stores the result in the destination //注意:有时得到的结果并不是m1和m2按位异或的结果? 
extern __m256 __cdecl _mm256_xor_ps(__m256 m1, __m256 m2); * Compare Packed Double-Precision Floating-Point Values * **** VCMPPD xmm1, xmm2, xmm3/m128, imm8 * **** VCMPPD ymm1, ymm2, ymm3/m256, imm8 * Performs an SIMD compare of the four packed double-precision floating-point * values in the second source operand (third operand) and the first source * operand (second operand) and returns the results of the comparison to the * destination operand (first operand). The comparison predicate operand * (immediate) specifies the type of comparison performed on each of the pairs * of packed values. * For 128-bit intrinsic with compare predicate values in range 0-7 compiler * may generate SSE2 instructions if it is warranted for performance reasons. extern __m128d __cdecl _mm_cmp_pd(__m128d m1, __m128d m2, const int predicate); extern __m256d __cdecl _mm256_cmp_pd(__m256d m1, __m256d m2, const int predicate); * Compare Packed Single-Precision Floating-Point Values * **** VCMPPS xmm1, xmm2, xmm3/m256, imm8 * **** VCMPPS ymm1, ymm2, ymm3/m256, imm8 * Performs a SIMD compare of the packed single-precision floating-point values * in the second source operand (third operand) and the first source operand * (second operand) and returns the results of the comparison to the destination * operand (first operand). The comparison predicate operand (immediate) * specifies the type of comparison performed on each of the pairs of packed * values. * For 128-bit intrinsic with compare predicate values in range 0-7 compiler * may generate SSE2 instructions if it is warranted for performance reasons. 
extern __m128 __cdecl _mm_cmp_ps(__m128 m1, __m128 m2, const int predicate); extern __m256 __cdecl _mm256_cmp_ps(__m256 m1, __m256 m2, const int predicate); * Compare Scalar Double-Precision Floating-Point Values * **** VCMPSD xmm1, xmm2, xmm3/m64, imm8 * Compares the low double-precision floating-point values in the second source * operand (third operand) and the first source operand (second operand) and * returns the results of the comparison to the destination operand (first * operand). The comparison predicate operand (immediate operand) specifies the * type of comparison performed. * For compare predicate values in range 0-7 compiler may generate SSE2 * instructions if it is warranted for performance reasons. extern __m128d __cdecl _mm_cmp_sd(__m128d m1, __m128d m2, const int predicate); * Compare Scalar Single-Precision Floating-Point Values * **** VCMPSS xmm1, xmm2, xmm3/m32, imm8 * Compares the low single-precision floating-point values in the second source * operand (third operand) and the first source operand (second operand) and * returns the results of the comparison to the destination operand (first * operand). The comparison predicate operand (immediate operand) specifies * the type of comparison performed. * For compare predicate values in range 0-7 compiler may generate SSE2 * instructions if it is warranted for performance reasons. 
extern __m128 __cdecl _mm_cmp_ss(__m128 m1, __m128 m2, const int predicate); * Convert Packed Doubleword Integers to * Packed Double-Precision Floating-Point Values * **** VCVTDQ2PD ymm1, xmm2/m128 * Converts four packed signed doubleword integers in the source operand to * four packed double-precision floating-point values in the destination //从__int32类型转换到double类型 extern __m256d __cdecl _mm256_cvtepi32_pd(__m128i m1); * Convert Packed Doubleword Integers to * Packed Single-Precision Floating-Point Values * **** VCVTDQ2PS ymm1, ymm2/m256 * Converts eight packed signed doubleword integers in the source operand to * eight packed double-precision floating-point values in the destination //从__int32类型转换到float类型 extern __m256 __cdecl _mm256_cvtepi32_ps(__m256i m1); * Convert Packed Double-Precision Floating-point values to * Packed Single-Precision Floating-Point Values * **** VCVTPD2PS xmm1, ymm2/m256 * Converts four packed double-precision floating-point values in the source * operand to four packed single-precision floating-point values in the * destination //从double类型转换到float类型 extern __m128 __cdecl _mm256_cvtpd_ps(__m256d m1); * Convert Packed Single Precision Floating-Point Values to * Packed Singed Doubleword Integer Values * **** VCVTPS2DQ ymm1, ymm2/m256 * Converts eight packed single-precision floating-point values in the source * operand to eight signed doubleword integers in the destination //从float类型转换到__int32类型 extern __m256i __cdecl _mm256_cvtps_epi32(__m256 m1); * Convert Packed Single Precision Floating-point values to * Packed Double Precision Floating-Point Values * **** VCVTPS2PD ymm1, xmm2/m128 * Converts four packed single-precision floating-point values in the source * operand to four packed double-precision floating-point values in the * destination //从float类型转换到double类型 extern __m256d __cdecl _mm256_cvtps_pd(__m128 m1); * Convert with Truncation Packed Double-Precision Floating-Point values to * Packed Doubleword Integers * **** VCVTTPD2DQ xmm1, 
ymm2/m256 * Converts four packed double-precision floating-point values in the source * operand to four packed signed doubleword integers in the destination. * When a conversion is inexact, a truncated (round toward zero) value is * returned. If a converted result is larger than the maximum signed doubleword * integer, the floating-point invalid exception is raised, and if this * exception is masked, the indefinite integer value (80000000H) is returned //从double类型转换到__int32类型,truncated extern __m128i __cdecl _mm256_cvttpd_epi32(__m256d m1); * Convert Packed Double-Precision Floating-point values to * Packed Doubleword Integers * **** VCVTPD2DQ xmm1, ymm2/m256 * Converts four packed double-precision floating-point values in the source * operand to four packed signed doubleword integers in the destination //从double类型转换到__int32类型 extern __m128i __cdecl _mm256_cvtpd_epi32(__m256d m1); * Convert with Truncation Packed Single Precision Floating-Point Values to * Packed Singed Doubleword Integer Values * **** VCVTTPS2DQ ymm1, ymm2/m256 * Converts eight packed single-precision floating-point values in the source * operand to eight signed doubleword integers in the destination. * When a conversion is inexact, a truncated (round toward zero) value is * returned. 
If a converted result is larger than the maximum signed doubleword * integer, the floating-point invalid exception is raised, and if this * exception is masked, the indefinite integer value (80000000H) is returned //从float类型转换到__int32类型,truncated extern __m256i __cdecl _mm256_cvttps_epi32(__m256 m1); * Extract packed floating-point values * **** VEXTRACTF128 xmm1/m128, ymm2, imm8 * Extracts 128-bits of packed floating-point values from the source operand * at an 128-bit offset from imm8[0] into the destination //offset:a constant integer value that represents the 128-bit offset from //where extraction must start //从256位中提取128位,offset决定提取的起始位置 extern __m128 __cdecl _mm256_extractf128_ps(__m256 m1, const int offset); extern __m128d __cdecl _mm256_extractf128_pd(__m256d m1, const int offset); extern __m128i __cdecl _mm256_extractf128_si256(__m256i m1, const int offset); * Zero All YMM registers * **** VZEROALL * Zeros contents of all YMM registers extern void __cdecl _mm256_zeroall(void); * Zero Upper bits of YMM registers * **** VZEROUPPER * Zeros the upper 128 bits of all YMM registers. 
The lower 128-bits of the * registers (the corresponding XMM registers) are unmodified extern void __cdecl _mm256_zeroupper(void); * Permute Single-Precision Floating-Point Values * **** VPERMILPS ymm1, ymm2, ymm3/m256 * **** VPERMILPS xmm1, xmm2, xmm3/m128 * Permute Single-Precision Floating-Point values in the first source operand * using 8-bit control fields in the low bytes of corresponding elements the * shuffle control and store results in the destination //control:a vector with 2-bit control fields, one for each corresponding element //of the source vector, for the 256-bit m1 source vector this control vector //contains eight 2-bit control fields,for the 128-bit m1 source vector this //control vector contains four 2-bit control fields extern __m256 __cdecl _mm256_permutevar_ps(__m256 m1, __m256i control); extern __m128 __cdecl _mm_permutevar_ps(__m128 a, __m128i control); * Permute Single-Precision Floating-Point Values * **** VPERMILPS ymm1, ymm2/m256, imm8 * **** VPERMILPS xmm1, xmm2/m128, imm8 * Permute Single-Precision Floating-Point values in the first source operand * using four 2-bit control fields in the 8-bit immediate and store results * in the destination //control:an integer specified as an 8-bit immediate;for the 256-bit m1 vector //this integer contains four 2-bit control fields in the low 8 bits of //the immediate, for the 128-bit m1 vector this integer contains two 2-bit //control fields in the low 4 bits of the immediate extern __m256 __cdecl _mm256_permute_ps(__m256 m1, int control); extern __m128 __cdecl _mm_permute_ps(__m128 a, int control); * Permute Double-Precision Floating-Point Values * **** VPERMILPD ymm1, ymm2, ymm3/m256 * **** VPERMILPD xmm1, xmm2, xmm3/m128 * Permute Double-Precision Floating-Point values in the first source operand * using 8-bit control fields in the low bytes of the second source operand * and store results in the destination //control:a vector with 1-bit control fields, one for each corresponding element //of 
the source vector, for the 256-bit m1 source vector this control vector //contains four 1-bit control fields in the low 4 bits of the immediate, for the //128-bit m1 source vector this control vector contains two 1-bit control fields //in the low 2 bits of the immediate extern __m256d __cdecl _mm256_permutevar_pd(__m256d m1, __m256i control); extern __m128d __cdecl _mm_permutevar_pd(__m128d a, __m128i control); * Permute Double-Precision Floating-Point Values * **** VPERMILPD ymm1, ymm2/m256, imm8 * **** VPERMILPD xmm1, xmm2/m128, imm8 * Permute Double-Precision Floating-Point values in the first source operand * using two, 1-bit control fields in the low 2 bits of the 8-bit immediate * and store results in the destination //control:an integer specified as an 8-bit immediate; for the 256-bit m1 vector //this integer contains four 1-bit control fields in the low 4 bits of the //immediate, for the 128-bit m1 vector this integer contains two 1-bit //control fields in the low 2 bits of the immediate extern __m256d __cdecl _mm256_permute_pd(__m256d m1, int control); extern __m128d __cdecl _mm_permute_pd(__m128d a, int control); * Permute Floating-Point Values * **** VPERM2F128 ymm1, ymm2, ymm3/m256, imm8 * Permute 128 bit floating-point-containing fields from the first source * operand and second source operand using bits in the 8-bit immediate and * store results in the destination //control:an immediate byte that specifies two 2-bit control fields and two //additional bits which specify zeroing behavior extern __m256 __cdecl _mm256_permute2f128_ps(__m256 m1, __m256 m2, int control); extern __m256d __cdecl _mm256_permute2f128_pd(__m256d m1, __m256d m2, int control); extern __m256i __cdecl _mm256_permute2f128_si256(__m256i m1, __m256i m2, int control); * Load with Broadcast * **** VBROADCASTSS ymm1, m32 * **** VBROADCASTSS xmm1, m32 * Load floating point values from the source operand and broadcast to all * elements of the destination //*a:pointer to a memory location 
that can hold constant 256-bit or //128-bit float32 values, 则r0=r1=...=rn=a[0] extern __m256 __cdecl _mm256_broadcast_ss(float const *a); extern __m128 __cdecl _mm_broadcast_ss(float const *a); * Load with Broadcast * **** VBROADCASTSD ymm1, m64 * Load floating point values from the source operand and broadcast to all * elements of the destination //则r0=r1=r2=r3=a[0] extern __m256d __cdecl _mm256_broadcast_sd(double const *a); * Load with Broadcast * **** VBROADCASTF128 ymm1, m128 * Load floating point values from the source operand and broadcast to all * elements of the destination //若*a为a[0],a[1],则r0=r2=a[0], r1=r3=a[1] extern __m256 __cdecl _mm256_broadcast_ps(__m128 const *a); extern __m256d __cdecl _mm256_broadcast_pd(__m128d const *a); * Insert packed floating-point values * **** VINSERTF128 ymm1, ymm2, xmm3/m128, imm8 * Performs an insertion of 128-bits of packed floating-point values from the * second source operand into an the destination at an 128-bit offset from * imm8[0]. The remaining portions of the destination are written by the * corresponding fields of the first source operand //offset:an integer value that represents the 128-bit offset //where the insertion must start //The remaining portions of the destination are written by the corresponding //elements of the first source vector, a extern __m256 __cdecl _mm256_insertf128_ps(__m256 a, __m128 b, int offset); extern __m256d __cdecl _mm256_insertf128_pd(__m256d a, __m128d b, int offset); extern __m256i __cdecl _mm256_insertf128_si256(__m256i a, __m128i b, int offset); * Move Aligned Packed Double-Precision Floating-Point Values * **** VMOVAPD ymm1, m256 * **** VMOVAPD m256, ymm1 * Moves 4 double-precision floating-point values from the source operand to * the destination //*a:the address must be 32-byte aligned extern __m256d __cdecl _mm256_load_pd(double const *a); extern void __cdecl _mm256_store_pd(double *a, __m256d b); * Move Aligned Packed Single-Precision Floating-Point Values * **** VMOVAPS 
ymm1, m256 * **** VMOVAPS m256, ymm1 * Moves 8 single-precision floating-point values from the source operand to * the destination //*a:the address must be 32-byte aligned extern __m256 __cdecl _mm256_load_ps(float const *a); extern void __cdecl _mm256_store_ps(float *a, __m256 b); * Move Unaligned Packed Double-Precision Floating-Point Values * **** VMOVUPD ymm1, m256 * **** VMOVUPD m256, ymm1 * Moves 256 bits of packed double-precision floating-point values from the * source operand to the destination //The address a does not need to be 32-byte aligned extern __m256d __cdecl _mm256_loadu_pd(double const *a); extern void __cdecl _mm256_storeu_pd(double *a, __m256d b); * Move Unaligned Packed Single-Precision Floating-Point Values * **** VMOVUPS ymm1, m256 * **** VMOVUPS m256, ymm1 * Moves 256 bits of packed single-precision floating-point values from the * source operand to the destination //The address a does not need to be 32-byte aligned extern __m256 __cdecl _mm256_loadu_ps(float const *a); extern void __cdecl _mm256_storeu_ps(float *a, __m256 b); * Move Aligned Packed Integer Values * **** VMOVDQA ymm1, m256 * **** VMOVDQA m256, ymm1 * Moves 256 bits of packed integer values from the source operand to the * destination //The address a must be 32-byte aligned extern __m256i __cdecl _mm256_load_si256(__m256i const *a); extern void __cdecl _mm256_store_si256(__m256i *a, __m256i b); * Move Unaligned Packed Integer Values * **** VMOVDQU ymm1, m256 * **** VMOVDQU m256, ymm1 * Moves 256 bits of packed integer values from the source operand to the * destination //The address a does not need to be 32-byte aligned extern __m256i __cdecl _mm256_loadu_si256(__m256i const *a); extern void __cdecl _mm256_storeu_si256(__m256i *a, __m256i b); * Conditional SIMD Packed Loads and Stores * **** VMASKMOVPD xmm1, xmm2, m128 * **** VMASKMOVPD ymm1, ymm2, m256 * **** VMASKMOVPD m128, xmm1, xmm2 * **** VMASKMOVPD m256, ymm1, ymm2 * Load forms: * Load packed values from 
the 128-bit (XMM forms) or 256-bit (YMM forms) * memory location (third operand) into the destination XMM or YMM register * (first operand) using a mask in the first source operand (second operand). * Store forms: * Stores packed values from the XMM or YMM register in the second source * operand (third operand) into the 128-bit (XMM forms) or 256-bit (YMM forms) * memory location using a mask in first source operand (second operand). * Stores are atomic. //The mask is calculated from the most significant bit of each qword of the mask //register. If any of the bits of the mask is set to zero, the corresponding value //from the memory location is not loaded, and the corresponding field of the //destination vector is set to zero. extern __m256d __cdecl _mm256_maskload_pd(double const *a, __m256i mask); extern void __cdecl _mm256_maskstore_pd(double *a, __m256i mask, __m256d b); extern __m128d __cdecl _mm_maskload_pd(double const *a, __m128i mask); extern void __cdecl _mm_maskstore_pd(double *a, __m128i mask, __m128d b); * Conditional SIMD Packed Loads and Stores * **** VMASKMOVPS xmm1, xmm2, m128 * **** VMASKMOVPS ymm1, ymm2, m256 * **** VMASKMOVPS m128, xmm1, xmm2 * **** VMASKMOVPS m256, ymm1, ymm2 * Load forms: * Load packed values from the 128-bit (XMM forms) or 256-bit (YMM forms) * memory location (third operand) into the destination XMM or YMM register * (first operand) using a mask in the first source operand (second operand). * Store forms: * Stores packed values from the XMM or YMM register in the second source * operand (third operand) into the 128-bit (XMM forms) or 256-bit (YMM forms) * memory location using a mask in first source operand (second operand). * Stores are atomic. //The mask is calculated from the most significant bit of each dword of the mask //register. If any of the bits of the mask is set to zero, the corresponding //value from the memory location is not loaded, and the corresponding field of //the destination vector is set to zero. 
extern __m256 __cdecl _mm256_maskload_ps(float const *a, __m256i mask); extern void __cdecl _mm256_maskstore_ps(float *a, __m256i mask, __m256 b); extern __m128 __cdecl _mm_maskload_ps(float const *a, __m128i mask); extern void __cdecl _mm_maskstore_ps(float *a, __m128i mask, __m128 b); * Replicate Single-Precision Floating-Point Values * **** VMOVSHDUP ymm1, ymm2/m256 * Duplicates odd-indexed single-precision floating-point values from the * source operand //a=(a0, a1, a2, a3, a4, a5, a6, a7);则r=(a1, a1, a3, a3, a5, a5, a7, a7) extern __m256 __cdecl _mm256_movehdup_ps(__m256 a); * Replicate Single-Precision Floating-Point Values * **** VMOVSLDUP ymm1, ymm2/m256 * Duplicates even-indexed single-precision floating-point values from the * source operand //a=(a0, a1, a2, a3, a4, a5, a6, a7);则r=(a0, a0, a2, a2, a4, a4, a6, a6) extern __m256 __cdecl _mm256_moveldup_ps(__m256 a); * Replicate Double-Precision Floating-Point Values * **** VMOVDDUP ymm1, ymm2/m256 * Duplicates even-indexed double-precision floating-point values from the * source operand //a=(a0, a1, a2, a3), 则r=(a0, a0, a2, a2) extern __m256d __cdecl _mm256_movedup_pd(__m256d a); * Move Unaligned Integer * **** VLDDQU ymm1, m256 * The instruction is functionally similar to VMOVDQU YMM, m256 for loading * from memory. 
That is: 32 bytes of data starting at an address specified by * the source memory operand are fetched from memory and placed in a * destination //*a:points to a memory location from where unaligned integer value must be moved extern __m256i __cdecl _mm256_lddqu_si256(__m256i const *a); * Store Packed Integers Using Non-Temporal Hint * **** VMOVNTDQ m256, ymm1 * Moves the packed integers in the source operand to the destination using a * non-temporal hint to prevent caching of the data during the write to memory //the address must be 32-byte aligned extern void __cdecl _mm256_stream_si256(__m256i *p, __m256i a); * Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint * **** VMOVNTPD m256, ymm1 * Moves the packed double-precision floating-point values in the source * operand to the destination operand using a non-temporal hint to prevent * caching of the data during the write to memory //the address must be 32-byte aligned extern void __cdecl _mm256_stream_pd(double *p, __m256d a); * Store Packed Single-Precision Floating-Point Values Using Non-Temporal Hint * **** VMOVNTPS m256, ymm1 * Moves the packed single-precision floating-point values in the source * operand to the destination operand using a non-temporal hint to prevent * caching of the data during the write to memory //the address must be 32-byte aligned extern void __cdecl _mm256_stream_ps(float *p, __m256 a); * Compute Approximate Reciprocals of Packed Single-Precision Floating-Point Values * **** VRCPPS ymm1, ymm2/m256 * Performs an SIMD computation of the approximate reciprocals of the eight * packed single precision floating-point values in the source operand and * stores the packed single-precision floating-point results in the destination //a=(a0, a1, a2, ..., a6, a7); //则r=(1/a0, 1/a1, ..., 1/a6, 1/a7), 求倒数 extern __m256 __cdecl _mm256_rcp_ps(__m256 a); * Compute Approximate Reciprocals of Square Roots of * Packed Single-Precision Floating-point Values * **** VRSQRTPS ymm1, 
ymm2/m256 * Performs an SIMD computation of the approximate reciprocals of the square * roots of the eight packed single precision floating-point values in the * source operand and stores the packed single-precision floating-point results * in the destination //a=(a0, a1, a2, ..., a6, a7); //则r=(1/sqrt(a0), 1/sqrt(a1), ..., 1/sqrt(a6), 1/sqrt(a7)), 先开方再求倒数 extern __m256 __cdecl _mm256_rsqrt_ps(__m256 a); * Square Root of Double-Precision Floating-Point Values * **** VSQRTPD ymm1, ymm2/m256 * Performs an SIMD computation of the square roots of the two or four packed * double-precision floating-point values in the source operand and stores * the packed double-precision floating-point results in the destination //a=(a0, a1, a2, a3);则r=(sqrt(a0),sqrt(a1), sqrt(a2), sqrt(a3)), 求开方 extern __m256d __cdecl _mm256_sqrt_pd(__m256d a); * Square Root of Single-Precision Floating-Point Values * **** VSQRTPS ymm1, ymm2/m256 * Performs an SIMD computation of the square roots of the eight packed * single-precision floating-point values in the source operand and stores the * packed single-precision floating-point results in the destination //a=(a0, a1, ..., a6, a7);则r=(sqrt(a0),sqrt(a1), ..., sqrt(a6), sqrt(a7)), 求开方 extern __m256 __cdecl _mm256_sqrt_ps(__m256 a); * Round Packed Double-Precision Floating-Point Values * **** VROUNDPD ymm1,ymm2/m256,imm8 * Round the four double-precision floating-point values in the source * operand by the rounding mode specified in the immediate operand and place * the result in the destination. The rounding process rounds the input to an * integral value and returns the result as a double-precision floating-point * value. The Precision Floating Point Exception is signaled according to the * immediate operand. If any source operand is an SNaN then it will be * converted to a QNaN. 
//a=(22.8, -11.3, -33.8, 4.3), //若iRoundMode=0x0A, 则r=(23, -11, -33, 5) //若iRoundMode=0x09, 则r=(22, -12, -34, 4) extern __m256d __cdecl _mm256_round_pd(__m256d a, int iRoundMode); #define _mm256_ceil_pd(val) _mm256_round_pd((val), 0x0A); #define _mm256_floor_pd(val) _mm256_round_pd((val), 0x09); * Round Packed Single-Precision Floating-Point Values * **** VROUNDPS ymm1,ymm2/m256,imm8 * Round the eight single-precision floating-point values in the source * operand by the rounding mode specified in the immediate operand and place * the result in the destination. The rounding process rounds the input to an * integral value and returns the result as a single-precision floating-point * value. The Precision Floating Point Exception is signaled according to the * immediate operand. If any source operand is an SNaN then it will be * converted to a QNaN. //用法与_mm256_round_pd相同 extern __m256 __cdecl _mm256_round_ps(__m256 a, int iRoundMode); #define _mm256_ceil_ps(val) _mm256_round_ps((val), 0x0A); #define _mm256_floor_ps(val) _mm256_round_ps((val), 0x09); * Unpack and Interleave High Packed Double-Precision Floating-Point Values * **** VUNPCKHPD ymm1,ymm2,ymm3/m256 * Performs an interleaved unpack of the high double-precision floating-point * values from the first source operand and the second source operand. 
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r=(m11, m21, m13, m23) extern __m256d __cdecl _mm256_unpackhi_pd(__m256d m1, __m256d m2); * Unpack and Interleave High Packed Single-Precision Floating-Point Values * **** VUNPCKHPS ymm1,ymm2,ymm3 * Performs an interleaved unpack of the high single-precision floating-point * values from the first source operand and the second source operand //m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27) //则r=(m12, m22, m13, m23, m16, m26, m17, m27) extern __m256 __cdecl _mm256_unpackhi_ps(__m256 m1, __m256 m2); * Unpack and Interleave Low Packed Double-Precision Floating-Point Values * **** VUNPCKLPD ymm1,ymm2,ymm3/m256 * Performs an interleaved unpack of the low double-precision floating-point * values from the first source operand and the second source operand //m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23) //则r=(m10, m20, m12, m22) extern __m256d __cdecl _mm256_unpacklo_pd(__m256d m1, __m256d m2); * Unpack and Interleave Low Packed Single-Precision Floating-Point Values * **** VUNPCKLPS ymm1,ymm2,ymm3 * Performs an interleaved unpack of the low single-precision floating-point * values from the first source operand and the second source operand //m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27) //则r=(m10, m20, m11, m21, m14, m24, m15, m25) extern __m256 __cdecl _mm256_unpacklo_ps(__m256 m1, __m256 m2); * Packed Bit Test * **** VPTEST ymm1, ymm2/m256 * VPTEST set the ZF flag if all bits in the result are 0 of the bitwise AND * of the first source operand and the second source operand. VPTEST sets the * CF flag if all bits in the result are 0 of the bitwise AND of the second * source operand and the logical NOT of the destination. 
extern int __cdecl _mm256_testz_si256(__m256i s1, __m256i s2); extern int __cdecl _mm256_testc_si256(__m256i s1, __m256i s2); extern int __cdecl _mm256_testnzc_si256(__m256i s1, __m256i s2); * Packed Bit Test * **** VTESTPD ymm1, ymm2/m256 * **** VTESTPD xmm1, xmm2/m128 * VTESTPD performs a bitwise comparison of all the sign bits of the * double-precision elements in the first source operation and corresponding * sign bits in the second source operand. If the AND of the two sets of bits * produces all zeros, the ZF is set else the ZF is clear. If the AND NOT of * the source sign bits with the dest sign bits produces all zeros the CF is * set else the CF is clear extern int __cdecl _mm256_testz_pd(__m256d s1, __m256d s2); extern int __cdecl _mm256_testc_pd(__m256d s1, __m256d s2); extern int __cdecl _mm256_testnzc_pd(__m256d s1, __m256d s2); extern int __cdecl _mm_testz_pd(__m128d s1, __m128d s2); extern int __cdecl _mm_testc_pd(__m128d s1, __m128d s2); extern int __cdecl _mm_testnzc_pd(__m128d s1, __m128d s2); * Packed Bit Test * **** VTESTPS ymm1, ymm2/m256 * **** VTESTPS xmm1, xmm2/m128 * VTESTPS performs a bitwise comparison of all the sign bits of the packed * single-precision elements in the first source operation and corresponding * sign bits in the second source operand. If the AND of the two sets of bits * produces all zeros, the ZF is set else the ZF is clear. 
If the AND NOT of * the source sign bits with the dest sign bits produces all zeros the CF is * set else the CF is clear extern int __cdecl _mm256_testz_ps(__m256 s1, __m256 s2); extern int __cdecl _mm256_testc_ps(__m256 s1, __m256 s2); extern int __cdecl _mm256_testnzc_ps(__m256 s1, __m256 s2); extern int __cdecl _mm_testz_ps(__m128 s1, __m128 s2); extern int __cdecl _mm_testc_ps(__m128 s1, __m128 s2); extern int __cdecl _mm_testnzc_ps(__m128 s1, __m128 s2); * Extract Double-Precision Floating-Point Sign mask * **** VMOVMSKPD r32, ymm2 * Extracts the sign bits from the packed double-precision floating-point * values in the source operand, formats them into a 4-bit mask, and stores * the mask in the destination extern int __cdecl _mm256_movemask_pd(__m256d a); * Extract Single-Precision Floating-Point Sign mask * **** VMOVMSKPS r32, ymm2 * Extracts the sign bits from the packed single-precision floating-point * values in the source operand, formats them into an 8-bit mask, and stores * the mask in the destination extern int __cdecl _mm256_movemask_ps(__m256 a); * Return 256-bit vector with all elements set to 0 //则r0=r1=...=rn=0 extern __m256d __cdecl _mm256_setzero_pd(void); extern __m256 __cdecl _mm256_setzero_ps(void); extern __m256i __cdecl _mm256_setzero_si256(void); * Return 256-bit vector initialized to specified arguments //则r = (d, c, b, a) extern __m256d __cdecl _mm256_set_pd(double a, double b, double c, double d); extern __m256 __cdecl _mm256_set_ps(float, float, float, float, float, float, float, float); extern __m256i __cdecl _mm256_set_epi8(char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char); extern __m256i __cdecl _mm256_set_epi16(short, short, short, short, short, short, short, short, short, short, short, short, short, short, short, short); extern __m256i __cdecl _mm256_set_epi32(int, int, int, int, int, int, 
int, int); extern __m256i __cdecl _mm256_set_epi64x(long long, long long, long long, long long); //则r = (a, b, c, d) extern __m256d __cdecl _mm256_setr_pd(double a, double b, double c, double d); extern __m256 __cdecl _mm256_setr_ps(float, float, float, float, float, float, float, float); extern __m256i __cdecl _mm256_setr_epi8(char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char, char); extern __m256i __cdecl _mm256_setr_epi16(short, short, short, short, short, short, short, short, short, short, short, short, short, short, short, short); extern __m256i __cdecl _mm256_setr_epi32(int, int, int, int, int, int, int, int); extern __m256i __cdecl _mm256_setr_epi64x(long long, long long, long long, long long); * Return 256-bit vector with all elements initialized to specified scalar //则r0 = ... = rn = a extern __m256d __cdecl _mm256_set1_pd(double a); extern __m256 __cdecl _mm256_set1_ps(float); extern __m256i __cdecl _mm256_set1_epi8(char); extern __m256i __cdecl _mm256_set1_epi16(short); extern __m256i __cdecl _mm256_set1_epi32(int); extern __m256i __cdecl _mm256_set1_epi64x(long long); * Support intrinsics to do vector type casts. These intrinsics do not introduce * extra moves to generated code. 
When cast is done from a 128 to 256-bit type * the low 128 bits of the 256-bit result contain source parameter value; the * upper 128 bits of the result are undefined 类型转换 当从128位到256位类型进行转换时,256位结果的低128位包含源参数值; 结果的高128位是未定义的 extern __m256 __cdecl _mm256_castpd_ps(__m256d a); extern __m256d __cdecl _mm256_castps_pd(__m256 a); extern __m256i __cdecl _mm256_castps_si256(__m256 a); extern __m256i __cdecl _mm256_castpd_si256(__m256d a); extern __m256 __cdecl _mm256_castsi256_ps(__m256i a); extern __m256d __cdecl _mm256_castsi256_pd(__m256i a); extern __m128 __cdecl _mm256_castps256_ps128(__m256 a); extern __m128d __cdecl _mm256_castpd256_pd128(__m256d a); extern __m128i __cdecl _mm256_castsi256_si128(__m256i a); extern __m256 __cdecl _mm256_castps128_ps256(__m128 a); extern __m256d __cdecl _mm256_castpd128_pd256(__m128d a); extern __m256i __cdecl _mm256_castsi128_si256(__m128i a); /* Start of new intrinsics for Dev10 SP1 * The list of extended control registers. * Currently, the list includes only one register. #define _XCR_XFEATURE_ENABLED_MASK 0 /* Returns the content of the specified extended control register */ extern unsigned __int64 __cdecl _xgetbv(unsigned int ext_ctrl_reg); /* Writes the value to the specified extended control register */ extern void __cdecl _xsetbv(unsigned int ext_ctrl_reg, unsigned __int64 val); * Performs a full or partial save of the enabled processor state components * using the specified memory address location and a mask. extern void __cdecl _xsave(void *mem, unsigned __int64 save_mask); extern void __cdecl _xsave64(void *mem, unsigned __int64 save_mask); * Performs a full or partial save of the enabled processor state components * using the specified memory address location and a mask. * Optimize the state save operation if possible. 
extern void __cdecl _xsaveopt(void *mem, unsigned __int64 save_mask); extern void __cdecl _xsaveopt64(void *mem, unsigned __int64 save_mask); * Performs a full or partial restore of the enabled processor states * using the state information stored in the specified memory address location * and a mask. extern void __cdecl _xrstor(void *mem, unsigned __int64 restore_mask); extern void __cdecl _xrstor64(void *mem, unsigned __int64 restore_mask); * Saves the current state of the x87 FPU, MMX technology, XMM, * and MXCSR registers to the specified 512-byte memory location. extern void __cdecl _fxsave(void *mem); extern void __cdecl _fxsave64(void *mem); * Restore the current state of the x87 FPU, MMX technology, XMM, * and MXCSR registers from the specified 512-byte memory location. extern void __cdecl _fxrstor(void *mem); extern void __cdecl _fxrstor64(void *mem);
// stdafx.h : include file for standard system include files, // or project specific include files that are used frequently, but // are changed infrequently #if !defined(AFX_STDAFX_H__C4B5DA9B_21EA_47D6_9253_A4245E58FBF5__INCLUDED_) #define AFX_STDAFX_H__C4B5DA9B_21EA_47D6_9253_A4245E58FBF5__INCLUDED_ #if _MSC_VER > 1000 #pragma once #endif // _MSC_VER > 1000 // TODO: reference additional headers your program requires here //{{AFX_INSERT_LOCATION}} // Microsoft Visual C++ will insert additional declarations immediately before the previous line. #endif // !defined(AFX_STDAFX_H__C4B5DA9B_21EA_47D6_9253_A4245E58FBF5__INCLUDED_) //#include &lt;ivec.h&gt;//MMX //#include &lt;fvec.h&gt;//SSE(also include ivec.h) //#include &lt;dvec.h&gt;//SSE2(also include fvec.h) #include &lt;mmintrin.h&gt; //MMX #include &lt;... 在C/C++程序中,使用AVX2指令有很多种方法。 嵌入汇编是一般的方法,但是对于不熟悉汇编语言的人来说,有点勉为其难。 gcc编译支持AVX2指令的编程。程序中需要使用头文件和,这样通过调用其中定义的一些函数,达到使用AVX2指令的目的,即用C/C++调用SIMD指令(单指令多数据)。 这里给出的样例程序是有关浮点向量运算的例子。 其中函数_mm_add_epi32()实现的是整数向量( QMAKE_CXXFLAGS += -msse4.2 我用的平台是TX2,在linux环境下,没有相应的指令集,需要下载此指令转换包,将其放在工程文件夹下,并将解锁后的 sse2neon.h文件与工程文件放在同一文件夹下,并用: #include "sse2neon.h" 代替源文件中的: #include <immintrin.h. 阅读代码的时候遇到了__m128i、_mm_set1_epi8、_mm_loadu_si128、_mm_max_epu8、_mm_min_epu8、_mm_store_si128、_mm_unpackhi_epi8、_mm_adds_epi16、_mm_srli_si128等SIMD指令集,所以想着作一个总结。 0. SIMD基础知识 SIMD是单指令多数据技术,目前Intel处理器支持的SIMD技术包括MMX、SSE以及AVX。 MMX是MultiMedia eXtensions(多媒体扩展)的缩写,是 作者:zyl910   之前我整理了一份VC6至VC2010中Intrinsics函数集对应表。现在VS2012发布了,它有没有增加Intrinsics函数集呢?于是我对此进行检查。   若是64位win8系统中默认安装的VS2012,Intrinsics头文件位于“C:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\include”目... C++数值计算简单加速技术(三)——SIMD SIMD全称Single Instruction Multiple Data,单指令多数据流,能够复制多个操作数,并把它们打包在大型寄存器的一组指令集。这个技术其实和GPU计算基本相似,就是几个执行部件同时访问内存,一次性获得所有操作数再进行运算,从而掩盖访存的时间开销,实现加速。因此只要能GPU并行的计算都可以通过SIMD优化,在CPU上获...