添加链接
link之家
链接快照平台
  • 输入网页链接,自动生成快照
  • 标签化管理网页链接

xmmintrin.h为SSE 头文件,此头文件里包含MMX头文件,其中__m128的定义为:

typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
     float               m128_f32[4];
     unsigned __int64    m128_u64[2];
     __int8              m128_i8[16];
     __int16             m128_i16[8];
     __int32             m128_i32[4];
     __int64             m128_i64[2];
     unsigned __int8     m128_u8[16];
     unsigned __int16    m128_u16[8];
     unsigned __int32    m128_u32[4];
 } __m128;
形式如下: extern __m128 _mm_add_ss(__m128 _A, __m128 _B); extern __m128 _mm_add_ps(__m128 _A, __m128 _B); extern __m128 _mm_sub_ss(__m128 _A, __m128 _B); extern __m128 _mm_sub_ps(__m128 _A, __m128 _B); extern __m128 _mm_mul_ss(__m128 _A, __m128 _B); extern __m128 _mm_mul_ps(__m128 _A, __m128 _B); extern __m128 _mm_div_ss(__m128 _A, __m128 _B); extern __m128 _mm_div_ps(__m128 _A, __m128 _B); extern __m128 _mm_sqrt_ss(__m128 _A); extern __m128 _mm_sqrt_ps(__m128 _A); extern __m128 _mm_rcp_ss(__m128 _A); extern __m128 _mm_rcp_ps(__m128 _A); extern __m128 _mm_rsqrt_ss(__m128 _A); extern __m128 _mm_rsqrt_ps(__m128 _A); extern __m128 _mm_min_ss(__m128 _A, __m128 _B); extern __m128 _mm_min_ps(__m128 _A, __m128 _B); extern __m128 _mm_max_ss(__m128 _A, __m128 _B); extern __m128 _mm_max_ps(__m128 _A, __m128 _B); * FP, logical extern __m128 _mm_and_ps(__m128 _A, __m128 _B); extern __m128 _mm_andnot_ps(__m128 _A, __m128 _B); extern __m128 _mm_or_ps(__m128 _A, __m128 _B); extern __m128 _mm_xor_ps(__m128 _A, __m128 _B); * FP, comparison extern __m128 _mm_cmpeq_ss(__m128 _A, __m128 _B); extern __m128 _mm_cmpeq_ps(__m128 _A, __m128 _B); extern __m128 _mm_cmplt_ss(__m128 _A, __m128 _B); extern __m128 _mm_cmplt_ps(__m128 _A, __m128 _B); extern __m128 _mm_cmple_ss(__m128 _A, __m128 _B); extern __m128 _mm_cmple_ps(__m128 _A, __m128 _B); extern __m128 _mm_cmpgt_ss(__m128 _A, __m128 _B); extern __m128 _mm_cmpgt_ps(__m128 _A, __m128 _B); extern __m128 _mm_cmpge_ss(__m128 _A, __m128 _B); extern __m128 _mm_cmpge_ps(__m128 _A, __m128 _B); extern __m128 _mm_cmpneq_ss(__m128 _A, __m128 _B); extern __m128 _mm_cmpneq_ps(__m128 _A, __m128 _B); extern __m128 _mm_cmpnlt_ss(__m128 _A, __m128 _B); extern __m128 _mm_cmpnlt_ps(__m128 _A, __m128 _B); extern __m128 _mm_cmpnle_ss(__m128 _A, __m128 _B); extern __m128 _mm_cmpnle_ps(__m128 _A, __m128 _B); extern __m128 _mm_cmpngt_ss(__m128 _A, __m128 _B); extern __m128 _mm_cmpngt_ps(__m128 _A, __m128 _B); extern __m128 _mm_cmpnge_ss(__m128 _A, __m128 _B); extern __m128 _mm_cmpnge_ps(__m128 _A, __m128 _B); extern __m128 _mm_cmpord_ss(__m128 _A, __m128 _B); extern __m128 _mm_cmpord_ps(__m128 _A, __m128 _B); extern __m128 _mm_cmpunord_ss(__m128 _A, __m128 _B); extern __m128 _mm_cmpunord_ps(__m128 _A, __m128 _B); extern int _mm_comieq_ss(__m128 _A, __m128 _B); extern int _mm_comilt_ss(__m128 _A, __m128 _B); extern int _mm_comile_ss(__m128 _A, __m128 _B); extern int _mm_comigt_ss(__m128 _A, __m128 _B); extern int _mm_comige_ss(__m128 _A, __m128 _B); extern int _mm_comineq_ss(__m128 _A, __m128 _B); extern int _mm_ucomieq_ss(__m128 _A, __m128 _B); extern int _mm_ucomilt_ss(__m128 _A, __m128 _B); extern int _mm_ucomile_ss(__m128 _A, __m128 _B); extern int _mm_ucomigt_ss(__m128 _A, __m128 _B); extern int _mm_ucomige_ss(__m128 _A, __m128 _B); extern int _mm_ucomineq_ss(__m128 _A, __m128 _B); * FP, conversions extern int _mm_cvt_ss2si(__m128 _A); extern __m64 _mm_cvt_ps2pi(__m128 _A); extern int _mm_cvtt_ss2si(__m128 _A); extern __m64 _mm_cvtt_ps2pi(__m128 _A); extern __m128 _mm_cvt_si2ss(__m128, int); extern __m128 _mm_cvt_pi2ps(__m128, __m64); extern float _mm_cvtss_f32(__m128 _A); * Support for 64-bit extension intrinsics #if defined(_M_X64) extern __int64 _mm_cvtss_si64(__m128 _A); extern __int64 _mm_cvttss_si64(__m128 _A); extern __m128 _mm_cvtsi64_ss(__m128 _A, __int64 _B); #endif * FP, misc extern __m128 _mm_shuffle_ps(__m128 _A, __m128 _B, unsigned int _Imm8); extern __m128 _mm_unpackhi_ps(__m128 _A, __m128 _B); extern __m128 _mm_unpacklo_ps(__m128 _A, __m128 _B); extern __m128 _mm_loadh_pi(__m128, __m64 const*); extern __m128 _mm_movehl_ps(__m128, __m128); extern __m128 _mm_movelh_ps(__m128, __m128); extern void _mm_storeh_pi(__m64 *, __m128); extern __m128 _mm_loadl_pi(__m128, __m64 const*); extern void _mm_storel_pi(__m64 *, __m128); extern int _mm_movemask_ps(__m128 _A); * Integer extensions extern int _m_pextrw(__m64, int); extern __m64 _m_pinsrw(__m64, int, int); extern __m64 _m_pmaxsw(__m64, __m64); extern __m64 _m_pmaxub(__m64, __m64); extern __m64 _m_pminsw(__m64, __m64); extern __m64 _m_pminub(__m64, __m64); extern int _m_pmovmskb(__m64); extern __m64 _m_pmulhuw(__m64, __m64); extern __m64 _m_pshufw(__m64, int); extern void _m_maskmovq(__m64, __m64, char *); extern __m64 _m_pavgb(__m64, __m64); extern __m64 _m_pavgw(__m64, __m64); extern __m64 _m_psadbw(__m64, __m64); * memory & initialization extern __m128 _mm_set_ss(float _A); extern __m128 _mm_set_ps1(float _A); extern __m128 _mm_set_ps(float _A, float _B, float _C, float _D); extern __m128 _mm_setr_ps(float _A, float _B, float _C, float _D); extern __m128 _mm_setzero_ps(void); extern __m128 _mm_load_ss(float const*_A); extern __m128 _mm_load_ps1(float const*_A); extern __m128 _mm_load_ps(float const*_A); extern __m128 _mm_loadr_ps(float const*_A); extern __m128 _mm_loadu_ps(float const*_A); extern void _mm_store_ss(float *_V, __m128 _A); extern void _mm_store_ps1(float *_V, __m128 _A); extern void _mm_store_ps(float *_V, __m128 _A); extern void _mm_storer_ps(float *_V, __m128 _A); extern void _mm_storeu_ps(float *_V, __m128 _A); extern void _mm_prefetch(char const*_A, int _Sel); extern void _mm_stream_pi(__m64 *, __m64); extern void _mm_stream_ps(float *, __m128); extern __m128 _mm_move_ss(__m128 _A, __m128 _B); extern void _mm_sfence(void); extern unsigned int _mm_getcsr(void); extern void _mm_setcsr(unsigned int); #ifdef __ICL extern void* __cdecl _mm_malloc(size_t _Siz, size_t _Al); extern void __cdecl _mm_free(void *_P); #endif /* Alternate intrinsic names definition */ #define _mm_cvtss_si32 _mm_cvt_ss2si #define _mm_cvtps_pi32 _mm_cvt_ps2pi #define _mm_cvttss_si32 _mm_cvtt_ss2si #define _mm_cvttps_pi32 _mm_cvtt_ps2pi #define _mm_cvtsi32_ss _mm_cvt_si2ss #define _mm_cvtpi32_ps _mm_cvt_pi2ps #define _mm_extract_pi16 _m_pextrw #define _mm_insert_pi16 _m_pinsrw #define _mm_max_pi16 _m_pmaxsw #define _mm_max_pu8 _m_pmaxub #define _mm_min_pi16 _m_pminsw #define _mm_min_pu8 _m_pminub #define _mm_movemask_pi8 _m_pmovmskb #define _mm_mulhi_pu16 _m_pmulhuw #define _mm_shuffle_pi16 _m_pshufw #define _mm_maskmove_si64 _m_maskmovq #define _mm_avg_pu8 _m_pavgb #define _mm_avg_pu16 _m_pavgw #define _mm_sad_pu8 _m_psadbw #define _mm_set1_ps _mm_set_ps1 #define _mm_load1_ps _mm_load_ps1 #define _mm_store1_ps _mm_store_ps1
/***********************************************************/
 /*  NAME : _mm_cvtpu16_ps                                  */
 /*  DESCRIPTION : Convert 4 16-bit unsigned integer values */
 /*                to 4 single-precision float values       */
 /*  IN : __m64 a                                           */
 /*  OUT : none                                             */
 /*  RETURN : __m128 : (float)a                             */
 /***********************************************************/
__inline __m128 _mm_cvtpu16_ps(__m64 a)
  __m128 tmp;
  __m64  ext_val = _mm_setzero_si64();
  tmp = _mm_cvtpi32_ps(_mm_setzero_ps(), _mm_unpackhi_pi16(a, ext_val));
  return(_mm_cvtpi32_ps(_mm_movelh_ps(tmp, tmp), 
                        _mm_unpacklo_pi16(a, ext_val)));
 /******************************************************/
 /*  NAME : _mm_cvtps_pi16                             */
 /*  DESCRIPTION : Convert 4 single-precision float    */
 /*                values to 4 16-bit integer values   */
 /*  IN : __m128 a                                     */
 /*  OUT : none                                        */
 /*  RETURN : __m64 : (short)a                         */
 /******************************************************/
__inline __m64 _mm_cvtps_pi16(__m128 a)
  return _mm_packs_pi32(_mm_cvtps_pi32(a), 
                        _mm_cvtps_pi32(_mm_movehl_ps(a, a)));
 /******************************************************/
 /*  NAME : _mm_cvtpi8_ps                              */
 /*  DESCRIPTION : Convert 4 8-bit integer values to 4 */
 /*                single-precision float values       */
 /*  IN : __m64 a                                      */
 /*  OUT : none                                        */
 /*  RETURN : __m128 : (float)a                        */
 /******************************************************/
__inline __m128 _mm_cvtpi8_ps(__m64 a)
  __m64  ext_val = _mm_cmpgt_pi8(_mm_setzero_si64(), a);
  return _mm_cvtpi16_ps(_mm_unpacklo_pi8(a, ext_val));
 /******************************************************/
 /*  NAME : _mm_cvtpu8_ps                              */
 /*  DESCRIPTION : Convert 4 8-bit unsigned integer    */
 /*                values to 4 single-precision float  */
 /*                values                              */
 /*  IN : __m64 a                                      */
 /*  OUT : none                                        */
 /*  RETURN : __m128 : (float)a                        */
 /******************************************************/
__inline __m128 _mm_cvtpu8_ps(__m64 a)
  return _mm_cvtpu16_ps(_mm_unpacklo_pi8(a, _mm_setzero_si64()));
 /******************************************************/
 /*  NAME : _mm_cvtps_pi8                              */
 /*  DESCRIPTION : Convert 4 single-precision float    */
 /*                values to 4 8-bit integer values    */
 /*  IN : __m128 a                                     */
 /*  OUT : none                                        */
 /*  RETURN : __m64 : (char)a                          */
 /******************************************************/
__inline __m64 _mm_cvtps_pi8(__m128 a)
  return _mm_packs_pi16(_mm_cvtps_pi16(a), _mm_setzero_si64());
 /******************************************************/
 /*  NAME : _mm_cvtpi32x2_ps                           */
 /*  DESCRIPTION : Convert 4 32-bit integer values     */
 /*                to 4 single-precision float values  */
 /*  IN : __m64 a : operand 1                          */
 /*       __m64 b : operand 2                          */
 /*  OUT : none                                        */
 /*  RETURN : __m128 : (float)a,(float)b               */
 /******************************************************/
__inline __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
  return _mm_movelh_ps(_mm_cvt_pi2ps(_mm_setzero_ps(), a), 
                       _mm_cvt_pi2ps(_mm_setzero_ps(), b)); 

这一块贴开始的一段函数:
/*******************************************************/
 /* MACRO for shuffle parameter for _mm_shuffle_ps().   */
 /* Argument fp3 is a digit[0123] that represents the fp*/
 /* from argument "b" of mm_shuffle_ps that will be     */
 /* placed in fp3 of result. fp2 is the same for fp2 in */
 /* result. fp1 is a digit[0123] that represents the fp */
 /* from argument "a" of mm_shuffle_ps that will be     */
 /* places in fp1 of result. fp0 is the same for fp0 of */
 /* result                                              */
 /*******************************************************/
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) (((fp3) << 6) | ((fp2) << 4) | \
                                     ((fp1) << 2) | ((fp0)))
 /*******************************************************/
 /* MACRO for performing the transpose of a 4x4 matrix  */
 /* of single precision floating point values.          */
 /* Arguments row0, row1, row2, and row3 are __m128     */
 /* values whose elements form the corresponding rows   */
 /* of a 4x4 matrix.  The matrix transpose is returned  */
 /* in arguments row0, row1, row2, and row3 where row0  */
 /* now holds column 0 of the original matrix, row1 now */
 /* holds column 1 of the original matrix, etc.         */
 /*******************************************************/
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) {                 \
            __m128 tmp3, tmp2, tmp1, tmp0;                          \
            tmp0   = _mm_shuffle_ps((row0), (row1), 0x44);          \
            tmp2   = _mm_shuffle_ps((row0), (row1), 0xEE);          \
            tmp1   = _mm_shuffle_ps((row2), (row3), 0x44);          \
            tmp3   = _mm_shuffle_ps((row2), (row3), 0xEE);          \
            (row0) = _mm_shuffle_ps(tmp0, tmp1, 0x88);              \
            (row1) = _mm_shuffle_ps(tmp0, tmp1, 0xDD);              \
            (row2) = _mm_shuffle_ps(tmp2, tmp3, 0x88);              \
            (row3) = _mm_shuffle_ps(tmp2, tmp3, 0xDD);              \
/* constants for use with _mm_prefetch */
#define _MM_HINT_T0     1
#define _MM_HINT_T1     2
#define _MM_HINT_T2     3
#define _MM_HINT_NTA    0
/* (this declspec not supported with 0.A or 0.B) */
#define _MM_ALIGN16 _CRT_ALIGN(16)
/* MACRO functions for setting and reading the MXCSR */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020
#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000
#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000
#define _MM_SET_EXCEPTION_STATE(mask)                               \
            _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (mask))
#define _MM_GET_EXCEPTION_STATE()                                   \
            (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_SET_EXCEPTION_MASK(mask)                                \
            _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (mask))
#define _MM_GET_EXCEPTION_MASK()                                    \
            (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_SET_ROUNDING_MODE(mode)                                 \
            _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (mode))
#define _MM_GET_ROUNDING_MODE()                                     \
            (_mm_getcsr() & _MM_ROUND_MASK)
#define _MM_SET_FLUSH_ZERO_MODE(mode)                               \
            _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (mode))
#define _MM_GET_FLUSH_ZERO_MODE()                                   \
            (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
xmmintrin.h为SSE 头文件,此头文件里包含MMX头文件,其中__m128的定义为:typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 { float m128_f32[4]; unsigned __int64 m128_u64[2]; __int8 This file is part of GCC. GCC is free software; you can redistribu... AC / C ++头文件,可将Intel SSE内部函数转换为ARN NEON内部函数。 英特尔的SIMD指令集(称为SSE)在许多应用程序中用于提高性能。 ARM还向其处理器引入了称为Neon的SIMD指令集。 重写为SSE编写的代码以在Neon上工作非常耗时。 这是一个头文件,可以将某些SSE限制自动转换为NEON限制。 将SSE2NEON.h文件放入您的源代码目录中。 找到代码中包含的以下SSE头文件: #include <xmmintrin> #include <emmintrin> 将它们替换为: #include "SSE2NEON.h" 在Linux上,使用以
阅读代码的时候遇到了__m128i、_mm_set1_epi8、_mm_loadu_si128、_mm_max_epu8、_mm_min_epu8、_mm_store_si128、_mm_unpackhi_epi8、_mm_adds_epi16、_mm_srli_si128等SIMD指令集,所以想着作一个总结。 0. SIMD基础知识 SIMD是单指令多数据技术,目前Intel处理器支持的SIMD技术包括MMX、SSE以及AVX。 MMX是MultiMedia eXtensions(多媒体扩展)的缩写,是
作者:zyl910 更多详情见——http://www.cnblogs.com/zyl910/archive/2012/04/26/md00.htmlSIMD函数整理:00 索引贴 R:寄存器。M:64位MM寄存器;X:128位XMM寄存器;Y:256位YMM寄存器。Name:函数名。Name2:另一种函数名。功能:功能描述。Asm:汇编指令。PCode:伪代码。
如果没有类型转换对应的汇编指令,我们可以用浮点加法来完成整型和浮点类型的互转。 如果有对应的汇编指令,调用之明显更简单高效,然而有些情况下类型转换会溢出,这要注意,计算机不会检查溢出而返回错误结果。一般来说,一次整型和浮点类型的互转的消耗和浮点加法一样,实际就是用浮点加法实现的类型转换。代码如下(不解释): typeconvert.h #include <stdint.h> //以下所有实现经过了严格测试,可以保证无误。其中,标准实现不会溢出,而用指令转换可能溢出。
"C_Cpp.codeAnalysis.clangTidy.args": [ /* unreasonable question(因为我认为,分析结果需要和编译结果匹配) * 该文件中 存在类型转换 (__v4si)(__m128i)__a * typedef __attribute((vector_size(16))) int __v4si * typedef __attribute((vector_size(16))) long long __m128i * 但是 Tidy .
__m64 数据类型可用于 MMX 和3DNow! 内部函数和定义在中 <xmmintrin.h> 。 // data_types__m64.cpp #include <xmmintrin.h> int main() __m64 x; 不应 __m64 直接访问字段。 但是,可在调试器中查看这些类型。 类型为的变量 __m64 映射到 MM [0-7] 寄存器。 _M64 类型的变量会在8
作者:zyl910。   现在很多编译器支持intrinsic函数,这给编写SSE等SIMD代码带来了方便。但是各个编译器略有差异,于是我编写了zintrin.h,智能引入intrinsic函数。 一、各种编译器的区别 1.1 Visual ...
当linux工程项目中出现__m128i 等未定义,它涉及到头文件emmintrin.h对指令集SSE等的加入。 -msse, -msse2, -msse3, -mmmx, -m3dnow 这些标记启用了x86和x86-64构架的SSE、SSE2、SSE3、MMX和3DNow!指令集。他们主要用于多媒体,游戏,及其他浮点运算密集的任务,虽然也包括了一些其他的数学增强指令。比较新的CPU都具有这
重要参考:http://clang.llvm.org/doxygen/emmintrin_8h.html#a907348c4473fe447912561977233aa96 emmintrin.h: This file is part of GCC.