/***
* arm64_neon.h - declarations/definitions for ARM64 NEON specific intrinsics
*
* Copyright (c) Microsoft Corporation. All rights reserved.
*
*Purpose:
* This include file contains the declarations for ARM64 NEON intrinsic functions
*
****/

#pragma once

// NOTE(review): the operands of the two #include lines below are missing in
// this copy of the file (a bare #include does not preprocess). Restore the
// header names from the upstream header before building.
//#include
#include

// Refuse to build unless one of the ARM64 target macros is defined.
#if !defined (_M_ARM64) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC)
#error This header is specific to ARM64 targets
#endif /* !defined (_M_ARM64) */

// Give every declaration below C linkage when compiled as C++.
// The matching closing brace is not part of this section of the file.
#if defined (__cplusplus)
extern "C" {
#endif /* defined (__cplusplus) */

///////////////////////////////////////////////////////////////////////////////
//
// _ADVSIMD_ALIGN(x) expands to __declspec(align(x)), or to nothing when
// __midl is defined. A definition supplied before this header wins.
#if !defined (_ADVSIMD_ALIGN)
#if defined (__midl)
#define _ADVSIMD_ALIGN(x)
#else /* defined (__midl) */
#define _ADVSIMD_ALIGN(x) __declspec(align(x))
#endif /* defined (__midl) */
#endif /* !defined (_ADVSIMD_ALIGN) */

// Name of the { low64, high64 } member inside __n128; defaults to `s` and can
// be overridden by defining DUMMYNEONSTRUCT before including this header.
#ifndef DUMMYNEONSTRUCT
#define DUMMYNEONSTRUCT s
#endif /* DUMMYNEONSTRUCT */

///////////////////////////////////////////////////////////////////////////////
//
// ARM64 Advanced SIMD 32bit type
//
// Union giving overlapping views of the same 32 bits: 1 x 32-bit, 2 x 16-bit or
// 4 x 8-bit integers (signed and unsigned), or 1 x float. Declared with 4-byte
// alignment.
// NOTE(review): __declspec(intrin_type) presumably tells the compiler to treat
// the type as a built-in vector type - confirm against the compiler docs.
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(4) __n32
{
    unsigned __int32 n32_u32[1];
    unsigned __int16 n32_u16[2];
    unsigned __int8 n32_u8[4];
    __int32 n32_i32[1];
    __int16 n32_i16[2];
    __int8 n32_i8[4];
    float n32_f32[1];
} __n32;

///////////////////////////////////////////////////////////////////////////////
//
// ARM64 Advanced SIMD 16bit type
//
// Overlapping views of 16 bits: 1 x 16-bit or 2 x 8-bit integers (signed and
// unsigned). Declared with 2-byte alignment.
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(2) __n16
{
    unsigned __int16 n16_u16[1];
    unsigned __int8 n16_u8[2];
    __int16 n16_i16[1];
    __int8 n16_i8[2];
} __n16;

///////////////////////////////////////////////////////////////////////////////
//
// ARM64 Advanced SIMD 8bit type
//
// Overlapping signed/unsigned view of a single byte. Unlike the other vector
// unions, no _ADVSIMD_ALIGN is applied here.
typedef union __declspec(intrin_type) __n8
{
    unsigned __int8 n8_u8[1];
    __int8 n8_i8[1];
} __n8;

///////////////////////////////////////////////////////////////////////////////
//
// ARM64 Advanced SIMD 64bit type
//
// Overlapping views of 64 bits: 1 x 64, 2 x 32, 4 x 16 or 8 x 8-bit integers
// (signed and unsigned), 2 x float, or 1 x double. Declared with 8-byte
// alignment.
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __n64
{
    unsigned __int64 n64_u64[1];
    unsigned __int32 n64_u32[2];
    unsigned __int16 n64_u16[4];
    unsigned __int8 n64_u8[8];
    __int64 n64_i64[1];
    __int32 n64_i32[2];
    __int16 n64_i16[4];
    __int8 n64_i8[8];
    float n64_f32[2];
    double n64_f64[1];
} __n64;

///////////////////////////////////////////////////////////////////////////////
//
// ARM64 Advanced SIMD 128bit type
//
// Overlapping views of 128 bits: 2 x 64, 4 x 32, 8 x 16 or 16 x 8-bit integers
// (signed and unsigned), 4 x float, 2 x double, or a pair of __n64 halves
// reachable through the DUMMYNEONSTRUCT member. Declared with 16-byte
// alignment.
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __n128
{
    unsigned __int64 n128_u64[2];
    unsigned __int32 n128_u32[4];
    unsigned __int16 n128_u16[8];
    unsigned __int8 n128_u8[16];
    __int64 n128_i64[2];
    __int32 n128_i32[4];
    __int16 n128_i16[8];
    __int8 n128_i8[16];
    float n128_f32[4];
    double n128_f64[2];

    struct
    {
        __n64 low64;
        __n64 high64;
    } DUMMYNEONSTRUCT;
} __n128;

// Aggregates of 2, 3 or 4 vectors, each exposed through a `val` array.
typedef struct __n64x2
{
    __n64 val[2];
} __n64x2;

typedef struct __n64x3
{
    __n64 val[3];
} __n64x3;

typedef struct __n64x4
{
    __n64 val[4];
} __n64x4;

typedef struct __n128x2
{
    __n128 val[2];
} __n128x2;

typedef struct __n128x3
{
    __n128 val[3];
} __n128x3;

typedef struct __n128x4
{
    __n128 val[4];
} __n128x4;

///////////////////////////////////////////////////////////////////////////////
//
// Scalar element types: polynomial elements are plain unsigned integers and
// float32_t is plain float.
typedef unsigned __int8 poly8_t;
typedef unsigned __int16 poly16_t;

typedef float float32_t;

///////////////////////////////////////////////////////////////////////////////
//
// Pointer reinterpretation helpers: each returns its argument cast to
// `__n64 *`. Nothing is copied and nothing is checked.
// NOTE(review): __n64 is declared with 8-byte alignment while the element
// pointers accepted here carry no such guarantee - callers presumably have to
// ensure the access is valid; confirm.
__inline __n64 *__int8ToN64( __int8 *p) { return (__n64 *)p; }
__inline __n64 *__int16ToN64( __int16 *p) { return (__n64 *)p; }
__inline __n64 *__int32ToN64( __int32 *p) { return (__n64 *)p; }
__inline __n64 *__int64ToN64( __int64 *p) { return (__n64 *)p; }
__inline __n64 *__uint8ToN64( unsigned __int8 *p) { return (__n64 *)p; }
__inline __n64 *__uint16ToN64( unsigned __int16 *p) { return (__n64 *)p; }
__inline __n64 *__uint32ToN64( unsigned __int32 *p) { return (__n64 *)p; }
__inline __n64 *__uint64ToN64( unsigned __int64 *p) { return (__n64 *)p; }
__inline __n64 *__poly8ToN64( poly8_t *p) { return (__n64 *)p; }
__inline __n64 *__poly16ToN64( poly16_t *p) { return (__n64 *)p; }
__inline __n64 *__float32ToN64( float32_t *p) { return (__n64 *)p; }
__inline const __n64 *__int8ToN64_c( const __int8 *p) { return (const __n64 *)p; } __inline const __n64 *__int16ToN64_c( const __int16 *p) { return (const __n64 *)p; } __inline const __n64 *__int32ToN64_c( const __int32 *p) { return (const __n64 *)p; } __inline const __n64 *__int64ToN64_c( const __int64 *p) { return (const __n64 *)p; } __inline const __n64 *__uint8ToN64_c( const unsigned __int8 *p) { return (const __n64 *)p; } __inline const __n64 *__uint16ToN64_c( const unsigned __int16 *p) { return (const __n64 *)p; } __inline const __n64 *__uint32ToN64_c( const unsigned __int32 *p) { return (const __n64 *)p; } __inline const __n64 *__uint64ToN64_c( const unsigned __int64 *p) { return (const __n64 *)p; } __inline const __n64 *__poly8ToN64_c( const poly8_t *p) { return (const __n64 *)p; } __inline const __n64 *__poly16ToN64_c( const poly16_t *p) { return (const __n64 *)p; } __inline const __n64 *__float32ToN64_c( const float32_t *p) { return (const __n64 *)p; } __inline __int32 __int8ToInt32(__int8 i) { return (__int32)i; } __inline __int32 __int16ToInt32(__int16 i) { return (__int32)i; } __inline __int32 __int32ToInt32(__int32 i) { return (__int32)i; } __inline __int64 __int64ToInt64(__int64 i) { return (__int64)i; } __inline __int32 __uint8ToInt32(unsigned __int8 i) { return (__int32)i; } __inline __int32 __uint16ToInt32(unsigned __int16 i) { return (__int32)i; } __inline __int32 __uint32ToInt32(unsigned __int32 i) { return (__int32)i; } __inline __int64 __uint64ToInt64(unsigned __int64 i) { return (__int64)i; } __inline __int32 __poly8ToInt32(poly8_t i) { return (__int32)i; } __inline __int32 __poly16ToInt32(poly16_t i) { return (__int32)i; } /////////////////////////////////////////////////////////////////////////////// // explicit types typedef __n64 float32x2_t; typedef __n64x2 float32x2x2_t; typedef __n64x3 float32x2x3_t; typedef __n64x4 float32x2x4_t; typedef __n64 int8x8_t; typedef __n64x2 int8x8x2_t; typedef __n64x3 int8x8x3_t; typedef __n64x4 
int8x8x4_t; typedef __n64 int16x4_t; typedef __n64x2 int16x4x2_t; typedef __n64x3 int16x4x3_t; typedef __n64x4 int16x4x4_t; typedef __n64 int32x2_t; typedef __n64x2 int32x2x2_t; typedef __n64x3 int32x2x3_t; typedef __n64x4 int32x2x4_t; typedef __n64 int64x1_t; typedef __n64x2 int64x1x2_t; typedef __n64x3 int64x1x3_t; typedef __n64x4 int64x1x4_t; typedef __n64 poly8x8_t; typedef __n64x2 poly8x8x2_t; typedef __n64x3 poly8x8x3_t; typedef __n64x4 poly8x8x4_t; typedef __n64 poly16x4_t; typedef __n64x2 poly16x4x2_t; typedef __n64x3 poly16x4x3_t; typedef __n64x4 poly16x4x4_t; typedef __n64 uint8x8_t; typedef __n64x2 uint8x8x2_t; typedef __n64x3 uint8x8x3_t; typedef __n64x4 uint8x8x4_t; typedef __n64 uint16x4_t; typedef __n64x2 uint16x4x2_t; typedef __n64x3 uint16x4x3_t; typedef __n64x4 uint16x4x4_t; typedef __n64 uint32x2_t; typedef __n64x2 uint32x2x2_t; typedef __n64x3 uint32x2x3_t; typedef __n64x4 uint32x2x4_t; typedef __n64 uint64x1_t; typedef __n64x2 uint64x1x2_t; typedef __n64x3 uint64x1x3_t; typedef __n64x4 uint64x1x4_t; typedef __n128 float32x4_t; typedef __n128x2 float32x4x2_t; typedef __n128x3 float32x4x3_t; typedef __n128x4 float32x4x4_t; typedef __n128 int8x16_t; typedef __n128x2 int8x16x2_t; typedef __n128x3 int8x16x3_t; typedef __n128x4 int8x16x4_t; typedef __n128 int16x8_t; typedef __n128x2 int16x8x2_t; typedef __n128x3 int16x8x3_t; typedef __n128x4 int16x8x4_t; typedef __n128 int32x4_t; typedef __n128x2 int32x4x2_t; typedef __n128x3 int32x4x3_t; typedef __n128x4 int32x4x4_t; typedef __n128 int64x2_t; typedef __n128x2 int64x2x2_t; typedef __n128x3 int64x2x3_t; typedef __n128x4 int64x2x4_t; typedef __n128 poly8x16_t; typedef __n128x2 poly8x16x2_t; typedef __n128x3 poly8x16x3_t; typedef __n128x4 poly8x16x4_t; typedef __n128 poly16x8_t; typedef __n128x2 poly16x8x2_t; typedef __n128x3 poly16x8x3_t; typedef __n128x4 poly16x8x4_t; typedef __n128 uint8x16_t; typedef __n128x2 uint8x16x2_t; typedef __n128x3 uint8x16x3_t; typedef __n128x4 uint8x16x4_t; typedef __n128 
uint16x8_t; typedef __n128x2 uint16x8x2_t; typedef __n128x3 uint16x8x3_t; typedef __n128x4 uint16x8x4_t; typedef __n128 uint32x4_t; typedef __n128x2 uint32x4x2_t; typedef __n128x3 uint32x4x3_t; typedef __n128x4 uint32x4x4_t; typedef __n128 uint64x2_t; typedef __n128x2 uint64x2x2_t; typedef __n128x3 uint64x2x3_t; typedef __n128x4 uint64x2x4_t; typedef __n64 float16x4_t; /////////////////////////////////////////////////////////////////////////////// // prototypes // DUP - register (core register to Neon register) __n64 neon_dupr8(__int32); __n64 neon_dupr16(__int32); __n64 neon_dupr32(__int32); __n64 neon_duprf32(float); __n64 neon_dupr64(__int64); __n64 neon_duprf64(double); __n128 neon_dupqr8(__int32); __n128 neon_dupqr16(__int32); __n128 neon_dupqr32(__int32); __n128 neon_dupqr64(__int64); __n128 neon_dupqrf32(float); __n128 neon_dupqrf64(double); #define vdup_n_f32(reg) neon_duprf32(reg) #define vdup_n_p16(reg) neon_dupr16(reg) #define vdup_n_p8(reg) neon_dupr8(reg) #define vdup_n_s16(reg) neon_dupr16(reg) #define vdup_n_s32(reg) neon_dupr32(reg) #define vdup_n_s64(reg) neon_dupr64(reg) #define vdup_n_s8(reg) neon_dupr8(reg) #define vdup_n_u16(reg) neon_dupr16(reg) #define vdup_n_u32(reg) neon_dupr32(reg) #define vdup_n_u64(reg) neon_dupr64(reg) #define vdup_n_u8(reg) neon_dupr8(reg) #define vdupq_n_f32(reg) neon_dupqrf32(reg) #define vdupq_n_p16(reg) neon_dupqr16(reg) #define vdupq_n_p8(reg) neon_dupqr8(reg) #define vdupq_n_s16(reg) neon_dupqr16(reg) #define vdupq_n_s32(reg) neon_dupqr32(reg) #define vdupq_n_s64(reg) neon_dupqr64(reg) #define vdupq_n_s8(reg) neon_dupqr8(reg) #define vdupq_n_u16(reg) neon_dupqr16(reg) #define vdupq_n_u32(reg) neon_dupqr32(reg) #define vdupq_n_u64(reg) neon_dupqr64(reg) #define vdupq_n_u8(reg) neon_dupqr8(reg) #define vmov_n_f32(reg) neon_duprf32(reg) #define vmov_n_p16(reg) neon_dupr16(reg) #define vmov_n_p8(reg) neon_dupr8(reg) #define vmov_n_s16(reg) neon_dupr16(reg) #define vmov_n_s32(reg) neon_dupr32(reg) #define 
vmov_n_s64(reg) neon_dupr64(reg) #define vmov_n_s8(reg) neon_dupr8(reg) #define vmov_n_u16(reg) neon_dupr16(reg) #define vmov_n_u32(reg) neon_dupr32(reg) #define vmov_n_u64(reg) neon_dupr64(reg) #define vmov_n_u8(reg) neon_dupr8(reg) #define vmovq_n_f32(reg) neon_dupqrf32(reg) #define vmovq_n_p16(reg) neon_dupqr16(reg) #define vmovq_n_p8(reg) neon_dupqr8(reg) #define vmovq_n_s16(reg) neon_dupqr16(reg) #define vmovq_n_s32(reg) neon_dupqr32(reg) #define vmovq_n_s64(reg) neon_dupqr64(reg) #define vmovq_n_s8(reg) neon_dupqr8(reg) #define vmovq_n_u16(reg) neon_dupqr16(reg) #define vmovq_n_u32(reg) neon_dupqr32(reg) #define vmovq_n_u64(reg) neon_dupqr64(reg) #define vmovq_n_u8(reg) neon_dupqr8(reg) // DUP - element (vector element into vector) __n64 neon_dupe8(__n64, const __int32); __n64 neon_dupe16(__n64, const __int32); __n64 neon_dupe32(__n64, const __int32); __n64 neon_dupe8q(__n128, const __int32); __n64 neon_dupe16q(__n128, const __int32); __n64 neon_dupe32q(__n128, const __int32); __n128 neon_dupqe8(__n64, const __int32); __n128 neon_dupqe16(__n64, const __int32); __n128 neon_dupqe32(__n64, const __int32); __n128 neon_dupqe64(__n64, const __int32); __n128 neon_dupqe8q(__n128, const __int32); __n128 neon_dupqe16q(__n128, const __int32); __n128 neon_dupqe32q(__n128, const __int32); __n128 neon_dupqe64q(__n128, const __int32); #define vdup_lane_f32(reg, lane) neon_dupe32(reg, lane) #define vdup_lane_p16(reg, lane) neon_dupe16(reg, lane) #define vdup_lane_p8(reg, lane) neon_dupe8(reg, lane) #define vdup_lane_s16(reg, lane) neon_dupe16(reg, lane) #define vdup_lane_s32(reg, lane) neon_dupe32(reg, lane) #define vdup_lane_s64(Dn, lane) ( __static_assert(lane == 0, "invalid lane index"), neon_dups64(Dn, lane)) #define vdup_lane_s8(reg, lane) neon_dupe8(reg, lane) #define vdup_lane_u16(reg, lane) neon_dupe16(reg, lane) #define vdup_lane_u32(reg, lane) neon_dupe32(reg, lane) #define vdup_lane_u64(Dn, lane) ( __static_assert(lane == 0, "invalid lane index"), neon_dups64(Dn, 
lane)) #define vdup_lane_u8(reg, lane) neon_dupe8(reg, lane) #define vdupq_lane_f32(reg, lane) neon_dupqe32(reg, lane) #define vdupq_lane_p16(reg, lane) neon_dupqe16(reg, lane) #define vdupq_lane_p8(reg, lane) neon_dupqe8(reg, lane) #define vdupq_lane_s16(reg, lane) neon_dupqe16(reg, lane) #define vdupq_lane_s32(reg, lane) neon_dupqe32(reg, lane) #define vdupq_lane_s64(reg, lane) neon_dupqe64(reg, lane) #define vdupq_lane_s8(reg, lane) neon_dupqe8(reg, lane) #define vdupq_lane_u16(reg, lane) neon_dupqe16(reg, lane) #define vdupq_lane_u32(reg, lane) neon_dupqe32(reg, lane) #define vdupq_lane_u64(reg, lane) neon_dupqe64(reg, lane) #define vdupq_lane_u8(reg, lane) neon_dupqe8(reg, lane) // DUP - scalar (vector element into scalar) __n8 neon_dups8 (__n64, const __int32); __n16 neon_dups16(__n64, const __int32); float neon_dups32(__n64, const __int32); __n64 neon_dups64(__n64, const __int32); __n8 neon_dups8q (__n128, const __int32); __n16 neon_dups16q(__n128, const __int32); float neon_dups32q(__n128, const __int32); __n64 neon_dups64q(__n128, const __int32); #define movs8(reg, lane) neon_dups8(reg, lane) #define movs16(reg, lane) neon_dups16(reg, lane) #define movs32(reg, lane) neon_dups32(reg, lane) #define movs64(reg, lane) neon_dups64(reg, lane) #define movs8q(reg, lane) neon_dupq8q(reg, lane) #define movs16q(reg, lane) neon_dupq16q(reg, lane) #define movs32q(reg, lane) neon_dups32q(reg, lane) #define movs64q(reg, lane) neon_dups64q(reg, lane) #define vget_lane_f32(Dm, lane) neon_dups32(Dm, lane) #define vgetq_lane_f32(Dm, lane) neon_dups32q(Dm, lane) // FMOV - to/from general, top half of 128 bits // The only two forms are these: // FMOV .D[1], // FMOV , .D[1] __n128 fmov_top_half_core(__n128, __int64); __int64 fmov_core_top_half(__n128); // FMOV - immediate __n64 neon_fmovi2s(const float); __n128 neon_fmovi4s(const float); __n128 neon_fmovi2d(const float); // MOVI, MVNI __n64 neon_movidw(const __int64); // bytemask one doubleword __n128 neon_moviqdw(const 
__int64); // bytemask per doubleword __n64 neon_movib(const int); // per byte __n128 neon_moviqb(const int); // per byte __n64 neon_movi_shift1w(const int, const int); // shift ones per word __n128 neon_moviq_shift1w(const int, const int); // shift ones per word __n64 neon_movih(const int); // per halfword __n128 neon_moviqh(const int); // per halfword __n64 neon_movi_shift0h(const int, const int); // shift zeroes per halfword __n128 neon_moviq_shift0h(const int, const int); // shift zeroes per halfword __n64 neon_moviw(const int); // per word __n128 neon_moviqw(const int); // per word __n64 neon_movi_shift0w(const int, const int); // shift zeroes per word __n128 neon_moviq_shift0w(const int, const int); // shift zeroes per word __n64 neon_mvni_shift1w(const int, const int); // shift ones per word __n128 neon_mvniq_shift1w(const int, const int); // shift ones per word __n64 neon_mvnih(const int); // per halfword __n128 neon_mvniqh(const int); // per halfword __n64 neon_mvni_shift0h(const int, const int); // shift zeroes per halfword __n128 neon_mvniq_shift0h(const int, const int); // shift zeroes per halfword __n64 neon_mvniw(const int); // per word __n128 neon_mvniqw(const int); // per word __n64 neon_mvni_shift0w(const int, const int); // shift zeroes per word __n128 neon_mvniq_shift0w(const int, const int); // shift zeroes per word // SMOV/UMOV - (move scalar into core) __int8 neon_smov8 (__n64, const __int32); __int8 neon_smovq8 (__n128, const __int32); __int64 neon_smov64_8 (__n64, const __int32); __int64 neon_smov64_q8 (__n128, const __int32); __int16 neon_smov16 (__n64, const __int32); __int16 neon_smovq16 (__n128, const __int32); __int64 neon_smov64_16 (__n64, const __int32); __int64 neon_smov64_q16 (__n128, const __int32); __int64 neon_smov64_32 (__n64, const __int32); __int64 neon_smov64_q32 (__n128, const __int32); unsigned __int8 neon_umov8 (__n64, const __int32); unsigned __int8 neon_umovq8 (__n128, const __int32); unsigned __int16 neon_umov16 (__n64, 
const __int32); unsigned __int16 neon_umovq16 (__n128, const __int32); unsigned __int32 neon_umov32 (__n64, const __int32); unsigned __int32 neon_umovq32 (__n128, const __int32); unsigned __int64 neon_umov64 (__n64, const __int32); unsigned __int64 neon_umovq64 (__n128, const __int32); #define vget_lane_p8(Dm, lane) neon_smov8(Dm, lane) #define vget_lane_s8(Dm, lane) neon_smov8(Dm, lane) #define vget_lane_u8(Dm, lane) neon_umov8(Dm, lane) #define vget_lane_p16(Dm, lane) neon_smov16(Dm, lane) #define vget_lane_s16(Dm, lane) neon_smov16(Dm, lane) #define vget_lane_u16(Dm, lane) neon_umov16(Dm, lane) #define vget_lane_s32(Dm, lane) neon_umov32(Dm, lane) // there's no smov32 into 32bit core reg (only into 64bit core reg) // umov32 is equivalent though because src/dst type size is the same #define vget_lane_s64(Dm, lane) neon_umov64(Dm, lane) // there's no smov64, umov64 is equivalent though because src/dst type size is the same #define vget_lane_u32(Dm, lane) neon_umov32(Dm, lane) #define vget_lane_u64(Dm, lane) neon_umov64(Dm, lane) #define vgetq_lane_p8(Dm, lane) neon_smovq8(Dm, lane) #define vgetq_lane_s8(Dm, lane) neon_smovq8(Dm, lane) #define vgetq_lane_u8(Dm, lane) neon_umovq8(Dm, lane) #define vgetq_lane_p16(Dm, lane) neon_smovq16(Dm, lane) #define vgetq_lane_s16(Dm, lane) neon_smovq16(Dm, lane) #define vgetq_lane_u16(Dm, lane) neon_umovq16(Dm, lane) #define vgetq_lane_s32(Dm, lane) neon_umovq32(Dm, lane) // there's no smov32 into 32bit core reg (only into 64bit core reg) // umov32 is equivalent though because src/dst type size is the same #define vgetq_lane_s64(Dm, lane) neon_umovq64(Dm, lane) // there's no smov64, umov64 is equivalent though because src/dst type size is the same #define vgetq_lane_u32(Dm, lane) neon_umovq32(Dm, lane) #define vgetq_lane_u64(Dm, lane) neon_umovq64(Dm, lane) // INS register __n64 neon_insr8 (__n64, const __int32, __int32); __n64 neon_insr16 (__n64, const __int32, __int32); __n64 neon_insr32 (__n64, const __int32, __int32); __n64 
neon_insr64 (__n64, const __int32, __int64); __n64 neon_insrf32 (__n64, const __int32, float); __n64 neon_insrf64 (__n64, const __int32, double); __n128 neon_insqr8 (__n128, const __int32, __int32); __n128 neon_insqr16 (__n128, const __int32, __int32); __n128 neon_insqr32 (__n128, const __int32, __int32); __n128 neon_insqr64 (__n128, const __int32, __int64); __n128 neon_insqrf32(__n128, const __int32, float); __n128 neon_insqrf64(__n128, const __int32, double); #define movr8(opeqneonreg, lane, corereg) neon_insr8(opeqneonreg, lane, corereg) #define movr16(opeqneonreg, lane, corereg) neon_insr16(opeqneonreg, lane, corereg) #define movr32(opeqneonreg, lane, corereg) neon_insr32(opeqneonreg, lane, corereg) #define movr64(opeqneonreg, lane, corereg) neon_insr64(opeqneonreg, lane, corereg) #define movrf32(opeqneonreg, lane, corereg) neon_insrf32(opeqneonreg, lane, corereg) #define movrf64(opeqneonreg, lane, corereg) neon_insrf64(opeqneonreg, lane, corereg) #define movqr8(opeqneonreg, lane, corereg) neon_insqr8(opeqneonreg, lane, corereg) #define movqr16(opeqneonreg, lane, corereg) neon_insqr16(opeqneonreg, lane, corereg) #define movqr32(opeqneonreg, lane, corereg) neon_insqr32(opeqneonreg, lane, corereg) #define movqr64(opeqneonreg, lane, corereg) neon_insqr64(opeqneonreg, lane, corereg) #define movqrf32(opeqneonreg, lane, corereg) neon_insqrf32(opeqneonreg, lane, corereg) #define movqrf64(opeqneonreg, lane, corereg) neon_insqrf64(opeqneonreg, lane, corereg) #define vset_lane_f32(corereg, opeqneonreg, lane) neon_insrf32(opeqneonreg, lane, corereg) #define vset_lane_f64(corereg, opeqneonreg, lane) neon_insrf64(opeqneonreg, lane, corereg) #define vset_lane_p16(corereg, opeqneonreg, lane) neon_insr16(opeqneonreg, lane, corereg) #define vset_lane_p8(corereg, opeqneonreg, lane) neon_insr8(opeqneonreg, lane, corereg) #define vset_lane_s16(corereg, opeqneonreg, lane) neon_insr16(opeqneonreg, lane, corereg) #define vset_lane_s32(corereg, opeqneonreg, lane) 
neon_insr32(opeqneonreg, lane, corereg) #define vset_lane_s64(corereg, opeqneonreg, lane) neon_insr64(opeqneonreg, lane, corereg) #define vset_lane_s8(corereg, opeqneonreg, lane) neon_insr8(opeqneonreg, lane, corereg) #define vset_lane_u16(corereg, opeqneonreg, lane) neon_insr16(opeqneonreg, lane, corereg) #define vset_lane_u32(corereg, opeqneonreg, lane) neon_insr32(opeqneonreg, lane, corereg) #define vset_lane_u64(corereg, opeqneonreg, lane) neon_insr64(opeqneonreg, lane, corereg) #define vset_lane_u8(corereg, opeqneonreg, lane) neon_insr8(opeqneonreg, lane, corereg) #define vsetq_lane_f32(corereg, opeqneonreg, lane) neon_insqrf32(opeqneonreg, lane, corereg) #define vsetq_lane_f64(corereg, opeqneonreg, lane) neon_insqrf64(opeqneonreg, lane, corereg) #define vsetq_lane_p16(corereg, opeqneonreg, lane) neon_insqr16(opeqneonreg, lane, corereg) #define vsetq_lane_p8(corereg, opeqneonreg, lane) neon_insqr8(opeqneonreg, lane, corereg) #define vsetq_lane_s16(corereg, opeqneonreg, lane) neon_insqr16(opeqneonreg, lane, corereg) #define vsetq_lane_s32(corereg, opeqneonreg, lane) neon_insqr32(opeqneonreg, lane, corereg) #define vsetq_lane_s64(corereg, opeqneonreg, lane) neon_insqr64(opeqneonreg, lane, corereg) #define vsetq_lane_s8(corereg, opeqneonreg, lane) neon_insqr8(opeqneonreg, lane, corereg) #define vsetq_lane_u16(corereg, opeqneonreg, lane) neon_insqr16(opeqneonreg, lane, corereg) #define vsetq_lane_u32(corereg, opeqneonreg, lane) neon_insqr32(opeqneonreg, lane, corereg) #define vsetq_lane_u64(corereg, opeqneonreg, lane) neon_insqr64(opeqneonreg, lane, corereg) #define vsetq_lane_u8(corereg, opeqneonreg, lane) neon_insqr8(opeqneonreg, lane, corereg) // INS element __n64 neon_inse8 (__n64, const __int32, __n64, const __int32); __n128 neon_insqe8q (__n128, const __int32, __n128, const __int32); __n64 neon_inse16 (__n64, const __int32, __n64, const __int32); __n128 neon_insqe16q (__n128, const __int32, __n128, const __int32); __n64 neon_inse32 (__n64, const __int32, 
__n64, const __int32); __n128 neon_insqe32q (__n128, const __int32, __n128, const __int32); __n64 neon_inse64 (__n64, const __int32, __n64, const __int32); __n128 neon_insqe64q (__n128, const __int32, __n128, const __int32); #define move8(opeqneonreg, laneDst, neonSrc, laneSrc) neon_inse8(opeqneoneg, laneDst, neonSrc, laneSrc) #define movqe8(opeqneonreg, laneDst, neonSrc, laneSrc) neon_insqe8(opeqneoneg, laneDst, neonSrc, laneSrc) #define move8q(opeqneonreg, laneDst, neonSrc, laneSrc) neon_inse8q(opeqneoneg, laneDst, neonSrc, laneSrc) #define movqe8q(opeqneonreg, laneDst, neonSrc, laneSrc) neon_insqe8q(opeqneoneg, laneDst, neonSrc, laneSrc) #define move16(opeqneonreg, laneDst, neonSrc, laneSrc) neon_inse16(opeqneoneg, laneDst, neonSrc, laneSrc) #define movqe16(opeqneonreg, laneDst, neonSrc, laneSrc) neon_insqe16(opeqneoneg, laneDst, neonSrc, laneSrc) #define move16q(opeqneonreg, laneDst, neonSrc, laneSrc) neon_inse16q(opeqneoneg, laneDst, neonSrc, laneSrc) #define movqe16q(opeqneonreg, laneDst, neonSrc, laneSrc) neon_insqe16q(opeqneoneg, laneDst, neonSrc, laneSrc) #define move32(opeqneonreg, laneDst, neonSrc, laneSrc) neon_inse32(opeqneoneg, laneDst, neonSrc, laneSrc) #define movqe32(opeqneonreg, laneDst, neonSrc, laneSrc) neon_insqe32(opeqneoneg, laneDst, neonSrc, laneSrc) #define move32q(opeqneonreg, laneDst, neonSrc, laneSrc) neon_inse32q(opeqneoneg, laneDst, neonSrc, laneSrc) #define movqe32q(opeqneonreg, laneDst, neonSrc, laneSrc) neon_insqe32q(opeqneoneg, laneDst, neonSrc, laneSrc) #define move64(opeqneonreg, laneDst, neonSrc, laneSrc) neon_inse64(opeqneoneg, laneDst, neonSrc, laneSrc) #define movqe64(opeqneonreg, laneDst, neonSrc, laneSrc) neon_insqe64(opeqneoneg, laneDst, neonSrc, laneSrc) #define move64q(opeqneonreg, laneDst, neonSrc, laneSrc) neon_inse64q(opeqneoneg, laneDst, neonSrc, laneSrc) #define movqe64q(opeqneonreg, laneDst, neonSrc, laneSrc) neon_insqe64q(opeqneoneg, laneDst, neonSrc, laneSrc) // NOT, MVN __n64 neon_not (__n64); __n128 neon_notq 
(__n128); #define mvn(src) neon_not(src) #define mvnq(src) neon_notq(src) #define vmvn_p16(reg) neon_not(reg) #define vmvn_p8(reg) neon_not(reg) #define vmvn_s16(reg) neon_not(reg) #define vmvn_s32(reg) neon_not(reg) #define vmvn_s8(reg) neon_not(reg) #define vmvn_u16(reg) neon_not(reg) #define vmvn_u32(reg) neon_not(reg) #define vmvn_u8(reg) neon_not(reg) #define vmvnq_p16(reg) neon_notq(reg) #define vmvnq_p8(reg) neon_notq(reg) #define vmvnq_s16(reg) neon_notq(reg) #define vmvnq_s32(reg) neon_notq(reg) #define vmvnq_s8(reg) neon_notq(reg) #define vmvnq_u16(reg) neon_notq(reg) #define vmvnq_u32(reg) neon_notq(reg) #define vmvnq_u8(reg) neon_notq(reg) // FNEG/NEG/SQNEG __n64 neon_fneg32(__n64); __n128 neon_fnegq32(__n128); __n128 neon_fnegq64(__n128); __n64 neon_neg8(__n64); __n128 neon_negq8(__n128); __n64 neon_neg16(__n64); __n128 neon_negq16(__n128); __n64 neon_neg32(__n64); __n128 neon_negq32(__n128); __n128 neon_negq64(__n128); __n64 neon_sqneg8(__n64); __n128 neon_sqnegq8(__n128); __n64 neon_sqneg16(__n64); __n128 neon_sqnegq16(__n128); __n64 neon_sqneg32(__n64); __n128 neon_sqnegq32(__n128); __n128 neon_sqnegq64(__n128); __n8 neon_sqnegs8(__n8); __n16 neon_sqnegs16(__n16); float neon_sqnegs32(float); __n64 neon_sqnegs64(__n64); __n64 neon_negs64(__n64); #define vneg_f32(reg) neon_fneg32(reg) #define vnegq_f32(reg) neon_fnegq32(reg) #define vneg_s8(reg) neon_neg8(reg) #define vnegq_s8(reg) neon_negq8(reg) #define vqneg_s8(reg) neon_sqneg8(reg) #define vqnegq_s8(reg) neon_sqnegq8(reg) #define vneg_s16(reg) neon_neg16(reg) #define vnegq_s16(reg) neon_negq16(reg) #define vqneg_s16(reg) neon_sqneg16(reg) #define vqnegq_s16(reg) neon_sqnegq16(reg) #define vneg_s32(reg) neon_neg32(reg) #define vnegq_s32(reg) neon_negq32(reg) #define vqneg_s32(reg) neon_sqneg32(reg) #define vqnegq_s32(reg) neon_sqnegq32(reg) // FABS/ABS/SQABS __n64 neon_fabs32(__n64); __n128 neon_fabsq32(__n128); __n128 neon_fabsq64(__n128); __n64 neon_abs8(__n64); __n128 neon_absq8(__n128); __n64 
neon_abs16(__n64); __n128 neon_absq16(__n128); __n64 neon_abs32(__n64); __n128 neon_absq32(__n128); __n128 neon_absq64(__n128); __n64 neon_sqabs8(__n64); __n128 neon_sqabsq8(__n128); __n64 neon_sqabs16(__n64); __n128 neon_sqabsq16(__n128); __n64 neon_sqabs32(__n64); __n128 neon_sqabsq32(__n128); __n128 neon_sqabsq64(__n128); __n8 neon_sqabss8(__n8); __n16 neon_sqabss16(__n16); float neon_sqabss32(float); __n64 neon_sqabss64(__n64); __n64 neon_abss64(__n64); #define vabs_f32(reg) neon_fabs32(reg) #define vabsq_f32(reg) neon_fabsq32(reg) #define vabs_s8(reg) neon_abs8(reg) #define vabsq_s8(reg) neon_absq8(reg) #define vqabs_s8(reg) neon_sqabs8(reg) #define vqabsq_s8(reg) neon_sqabsq8(reg) #define vabs_s16(reg) neon_abs16(reg) #define vabsq_s16(reg) neon_absq16(reg) #define vqabs_s16(reg) neon_sqabs16(reg) #define vqabsq_s16(reg) neon_sqabsq16(reg) #define vabs_s32(reg) neon_abs32(reg) #define vabsq_s32(reg) neon_absq32(reg) #define vqabs_s32(reg) neon_sqabs32(reg) #define vqabsq_s32(reg) neon_sqabsq32(reg) // ADD, FADD, SQADD, UQADD, SUQADD, USQADD __n64 neon_fadd32(__n64, __n64); __n128 neon_faddq32(__n128, __n128); __n128 neon_faddq64(__n128, __n128); __n64 neon_add8(__n64, __n64); __n128 neon_addq8(__n128, __n128); __n64 neon_add16(__n64, __n64); __n128 neon_addq16(__n128, __n128); __n64 neon_add32(__n64, __n64); __n128 neon_addq32(__n128, __n128); __n128 neon_addq64(__n128, __n128); __n64 neon_sqadd8(__n64, __n64); __n128 neon_sqaddq8(__n128, __n128); __n64 neon_sqadd16(__n64, __n64); __n128 neon_sqaddq16(__n128, __n128); __n64 neon_sqadd32(__n64, __n64); __n128 neon_sqaddq32(__n128, __n128); __n128 neon_sqaddq64(__n128, __n128); __n64 neon_uqadd8(__n64, __n64); __n128 neon_uqaddq8(__n128, __n128); __n64 neon_uqadd16(__n64, __n64); __n128 neon_uqaddq16(__n128, __n128); __n64 neon_uqadd32(__n64, __n64); __n128 neon_uqaddq32(__n128, __n128); __n128 neon_uqaddq64(__n128, __n128); __n64 neon_suqadd8(__n64, __n64); __n128 neon_suqaddq8(__n128, __n128); __n64 
neon_suqadd16(__n64, __n64); __n128 neon_suqaddq16(__n128, __n128); __n64 neon_suqadd32(__n64, __n64); __n128 neon_suqaddq32(__n128, __n128); __n128 neon_suqaddq64(__n128, __n128); __n64 neon_usqadd8(__n64, __n64); __n128 neon_usqaddq8(__n128, __n128); __n64 neon_usqadd16(__n64, __n64); __n128 neon_usqaddq16(__n128, __n128); __n64 neon_usqadd32(__n64, __n64); __n128 neon_usqaddq32(__n128, __n128); __n128 neon_usqaddq64(__n128, __n128); __n64 neon_adds64(__n64, __n64); __n64 neon_sqadds64(__n64, __n64); float neon_sqadds32(float, float); __n16 neon_sqadds16(__n16, __n16); __n8 neon_sqadds8(__n8, __n8); __n64 neon_uqadds64(__n64, __n64); float neon_uqadds32(float, float); __n16 neon_uqadds16(__n16, __n16); __n8 neon_uqadds8(__n8, __n8); __n8 neon_suqadds8(__n8, __n8); __n16 neon_suqadds16(__n16, __n16); float neon_suqadds32(float, float); __n64 neon_suqadds64(__n64, __n64); __n8 neon_usqadds8(__n8, __n8); __n16 neon_usqadds16(__n16, __n16); float neon_usqadds32(float, float); __n64 neon_usqadds64(__n64, __n64); #define vadd_s8(src1, src2) neon_add8(src1, src2) #define vadd_u8(src1, src2) neon_add8(src1, src2) #define vadd_s16(src1, src2) neon_add16(src1, src2) #define vadd_u16(src1, src2) neon_add16(src1, src2) #define vadd_s32(src1, src2) neon_add32(src1, src2) #define vadd_u32(src1, src2) neon_add32(src1, src2) #define vadd_f32(src1, src2) neon_fadd32(src1, src2) #define vadd_s64(src1, src2) neon_adds64(src1, src2) #define vadd_u64(src1, src2) neon_adds64(src1, src2) #define vaddq_s8(src1, src2) neon_addq8(src1, src2) #define vaddq_u8(src1, src2) neon_addq8(src1, src2) #define vaddq_s16(src1, src2) neon_addq16(src1, src2) #define vaddq_u16(src1, src2) neon_addq16(src1, src2) #define vaddq_s32(src1, src2) neon_addq32(src1, src2) #define vaddq_u32(src1, src2) neon_addq32(src1, src2) #define vaddq_f32(src1, src2) neon_faddq32(src1, src2) #define vaddq_s64(src1, src2) neon_addq64(src1, src2) #define vaddq_u64(src1, src2) neon_addq64(src1, src2) #define vqadd_s8(src1, 
src2) neon_sqadd8(src1, src2) #define vqadd_u8(src1, src2) neon_uqadd8(src1, src2) #define vqadd_s16(src1, src2) neon_sqadd16(src1, src2) #define vqadd_u16(src1, src2) neon_uqadd16(src1, src2) #define vqadd_s32(src1, src2) neon_sqadd32(src1, src2) #define vqadd_u32(src1, src2) neon_uqadd32(src1, src2) #define vqadd_s64(src1, src2) neon_sqadds64(src1, src2) #define vqadd_u64(src1, src2) neon_uqadds64(src1, src2) #define vqaddq_s8(src1, src2) neon_sqaddq8(src1, src2) #define vqaddq_u8(src1, src2) neon_uqaddq8(src1, src2) #define vqaddq_s16(src1, src2) neon_sqaddq16(src1, src2) #define vqaddq_u16(src1, src2) neon_uqaddq16(src1, src2) #define vqaddq_s32(src1, src2) neon_sqaddq32(src1, src2) #define vqaddq_u32(src1, src2) neon_uqaddq32(src1, src2) #define vqaddq_s64(src1, src2) neon_sqaddq64(src1, src2) #define vqaddq_u64(src1, src2) neon_uqaddq64(src1, src2) // SUB, FSUB, SQSUB, UQSUB __n64 neon_fsub32(__n64, __n64); __n128 neon_fsubq32(__n128, __n128); __n128 neon_fsubq64(__n128, __n128); __n64 neon_sub8(__n64, __n64); __n128 neon_subq8(__n128, __n128); __n64 neon_sub16(__n64, __n64); __n128 neon_subq16(__n128, __n128); __n64 neon_sub32(__n64, __n64); __n128 neon_subq32(__n128, __n128); __n128 neon_subq64(__n128, __n128); __n64 neon_sqsub8(__n64, __n64); __n128 neon_sqsubq8(__n128, __n128); __n64 neon_sqsub16(__n64, __n64); __n128 neon_sqsubq16(__n128, __n128); __n64 neon_sqsub32(__n64, __n64); __n128 neon_sqsubq32(__n128, __n128); __n128 neon_sqsubq64(__n128, __n128); __n64 neon_uqsub8(__n64, __n64); __n128 neon_uqsubq8(__n128, __n128); __n64 neon_uqsub16(__n64, __n64); __n128 neon_uqsubq16(__n128, __n128); __n64 neon_uqsub32(__n64, __n64); __n128 neon_uqsubq32(__n128, __n128); __n128 neon_uqsubq64(__n128, __n128); __n64 neon_subs64(__n64, __n64); __n64 neon_sqsubs64(__n64, __n64); float neon_sqsubs32(float, float); __n16 neon_sqsubs16(__n16, __n16); __n8 neon_sqsubs8(__n8, __n8); __n64 neon_uqsubs64(__n64, __n64); float neon_uqsubs32(float, float); __n16 
neon_uqsubs16(__n16, __n16); __n8 neon_uqsubs8(__n8, __n8); #define vsub_s8(src1, src2) neon_sub8(src1, src2) #define vsub_u8(src1, src2) neon_sub8(src1, src2) #define vsub_s16(src1, src2) neon_sub16(src1, src2) #define vsub_u16(src1, src2) neon_sub16(src1, src2) #define vsub_s32(src1, src2) neon_sub32(src1, src2) #define vsub_u32(src1, src2) neon_sub32(src1, src2) #define vsub_f32(src1, src2) neon_fsub32(src1, src2) #define vsub_s64(src1, src2) neon_subs64(src1, src2) #define vsub_u64(src1, src2) neon_subs64(src1, src2) #define vsubq_s8(src1, src2) neon_subq8(src1, src2) #define vsubq_u8(src1, src2) neon_subq8(src1, src2) #define vsubq_s16(src1, src2) neon_subq16(src1, src2) #define vsubq_u16(src1, src2) neon_subq16(src1, src2) #define vsubq_s32(src1, src2) neon_subq32(src1, src2) #define vsubq_u32(src1, src2) neon_subq32(src1, src2) #define vsubq_f32(src1, src2) neon_fsubq32(src1, src2) #define vsubq_s64(src1, src2) neon_subq64(src1, src2) #define vsubq_u64(src1, src2) neon_subq64(src1, src2) #define vqsub_s8(src1, src2) neon_sqsub8(src1, src2) #define vqsub_u8(src1, src2) neon_uqsub8(src1, src2) #define vqsub_s16(src1, src2) neon_sqsub16(src1, src2) #define vqsub_u16(src1, src2) neon_uqsub16(src1, src2) #define vqsub_s32(src1, src2) neon_sqsub32(src1, src2) #define vqsub_u32(src1, src2) neon_uqsub32(src1, src2) #define vqsub_s64(src1, src2) neon_sqsubs64(src1, src2) #define vqsub_u64(src1, src2) neon_uqsubs64(src1, src2) #define vqsubq_s8(src1, src2) neon_sqsubq8(src1, src2) #define vqsubq_u8(src1, src2) neon_uqsubq8(src1, src2) #define vqsubq_s16(src1, src2) neon_sqsubq16(src1, src2) #define vqsubq_u16(src1, src2) neon_uqsubq16(src1, src2) #define vqsubq_s32(src1, src2) neon_sqsubq32(src1, src2) #define vqsubq_u32(src1, src2) neon_uqsubq32(src1, src2) #define vqsubq_s64(src1, src2) neon_sqsubq64(src1, src2) #define vqsubq_u64(src1, src2) neon_uqsubq64(src1, src2) // SH(R)ADD, UH(R)ADD and SUB __n64 neon_shadd8(__n64, __n64); __n64 neon_shadd16(__n64, __n64); 
// Remaining prototypes of the SHADD/SRHADD/UHADD/URHADD/SHSUB/UHSUB group.
// Each comes as an __n64 form and a 'q' form on __n128, for 8-, 16- and
// 32-bit widths.
__n64 neon_shadd32(__n64, __n64);
__n128 neon_shaddq8(__n128, __n128);
__n128 neon_shaddq16(__n128, __n128);
__n128 neon_shaddq32(__n128, __n128);
__n64 neon_srhadd8(__n64, __n64);
__n64 neon_srhadd16(__n64, __n64);
__n64 neon_srhadd32(__n64, __n64);
__n128 neon_srhaddq8(__n128, __n128);
__n128 neon_srhaddq16(__n128, __n128);
__n128 neon_srhaddq32(__n128, __n128);
__n64 neon_uhadd8(__n64, __n64);
__n64 neon_uhadd16(__n64, __n64);
__n64 neon_uhadd32(__n64, __n64);
__n128 neon_uhaddq8(__n128, __n128);
__n128 neon_uhaddq16(__n128, __n128);
__n128 neon_uhaddq32(__n128, __n128);
__n64 neon_urhadd8(__n64, __n64);
__n64 neon_urhadd16(__n64, __n64);
__n64 neon_urhadd32(__n64, __n64);
__n128 neon_urhaddq8(__n128, __n128);
__n128 neon_urhaddq16(__n128, __n128);
__n128 neon_urhaddq32(__n128, __n128);
__n64 neon_shsub8(__n64, __n64);
__n64 neon_shsub16(__n64, __n64);
__n64 neon_shsub32(__n64, __n64);
__n128 neon_shsubq8(__n128, __n128);
__n128 neon_shsubq16(__n128, __n128);
__n128 neon_shsubq32(__n128, __n128);
__n64 neon_uhsub8(__n64, __n64);
__n64 neon_uhsub16(__n64, __n64);
__n64 neon_uhsub32(__n64, __n64);
__n128 neon_uhsubq8(__n128, __n128);
__n128 neon_uhsubq16(__n128, __n128);
__n128 neon_uhsubq32(__n128, __n128);

// vhadd/vrhadd/vhsub aliases: _s suffixes forward to the neon_s* functions,
// _u suffixes to the neon_u* functions of the same width.
#define vhadd_s8(src1, src2) neon_shadd8(src1, src2)
#define vhadd_s16(src1, src2) neon_shadd16(src1, src2)
#define vhadd_s32(src1, src2) neon_shadd32(src1, src2)
#define vhaddq_s8(src1, src2) neon_shaddq8(src1, src2)
#define vhaddq_s16(src1, src2) neon_shaddq16(src1, src2)
#define vhaddq_s32(src1, src2) neon_shaddq32(src1, src2)
#define vrhadd_s8(src1, src2) neon_srhadd8(src1, src2)
#define vrhadd_s16(src1, src2) neon_srhadd16(src1, src2)
#define vrhadd_s32(src1, src2) neon_srhadd32(src1, src2)
#define vrhaddq_s8(src1, src2) neon_srhaddq8(src1, src2)
#define vrhaddq_s16(src1, src2) neon_srhaddq16(src1, src2)
#define vrhaddq_s32(src1, src2) neon_srhaddq32(src1, src2)
#define vhadd_u8(src1, src2) neon_uhadd8(src1, src2)
#define vhadd_u16(src1, src2) neon_uhadd16(src1, src2)
#define vhadd_u32(src1, src2) neon_uhadd32(src1, src2)
#define vhaddq_u8(src1, src2) neon_uhaddq8(src1, src2)
#define vhaddq_u16(src1, src2) neon_uhaddq16(src1, src2)
#define vhaddq_u32(src1, src2) neon_uhaddq32(src1, src2)
#define vrhadd_u8(src1, src2) neon_urhadd8(src1, src2)
#define vrhadd_u16(src1, src2) neon_urhadd16(src1, src2)
#define vrhadd_u32(src1, src2) neon_urhadd32(src1, src2)
#define vrhaddq_u8(src1, src2) neon_urhaddq8(src1, src2)
#define vrhaddq_u16(src1, src2) neon_urhaddq16(src1, src2)
#define vrhaddq_u32(src1, src2) neon_urhaddq32(src1, src2)
#define vhsub_s8(src1, src2) neon_shsub8(src1, src2)
#define vhsub_s16(src1, src2) neon_shsub16(src1, src2)
#define vhsub_s32(src1, src2) neon_shsub32(src1, src2)
#define vhsubq_s8(src1, src2) neon_shsubq8(src1, src2)
#define vhsubq_s16(src1, src2) neon_shsubq16(src1, src2)
#define vhsubq_s32(src1, src2) neon_shsubq32(src1, src2)
#define vhsub_u8(src1, src2) neon_uhsub8(src1, src2)
#define vhsub_u16(src1, src2) neon_uhsub16(src1, src2)
#define vhsub_u32(src1, src2) neon_uhsub32(src1, src2)
#define vhsubq_u8(src1, src2) neon_uhsubq8(src1, src2)
#define vhsubq_u16(src1, src2) neon_uhsubq16(src1, src2)
#define vhsubq_u32(src1, src2) neon_uhsubq32(src1, src2)

// ADDP/FADDP
//
// Two-operand forms take a pair of __n64 or __n128 vectors; the forms with
// an 's' before the width take a single vector operand.
__n64 neon_addp8 (__n64, __n64);
__n64 neon_addp16 (__n64, __n64);
__n64 neon_addp32 (__n64, __n64);
__n64 neon_addps64(__n64);
__n128 neon_addpq8 (__n128, __n128);
__n128 neon_addpq16(__n128, __n128);
__n128 neon_addpq32(__n128, __n128);
__n128 neon_addpq64(__n128, __n128);
__n64 neon_faddp32(__n64, __n64);
float neon_faddps32(__n64);
__n128 neon_faddpq32 (__n128, __n128);
__n128 neon_faddpq64 (__n128, __n128);
__n64 neon_faddpsq64(__n128);

// vpadd aliases: _s and _u suffixes of one width share a neon_addp*
// function; the _f32 suffixes forward to neon_faddp*.
#define vpadd_s8(src1, src2) neon_addp8(src1, src2)
#define vpadd_u8(src1, src2) neon_addp8(src1, src2)
#define vpadd_s16(src1, src2) neon_addp16(src1, src2)
#define vpadd_u16(src1, src2) neon_addp16(src1, src2)
#define vpadd_s32(src1, src2) neon_addp32(src1, src2)
#define vpadd_u32(src1, src2) neon_addp32(src1, src2)
#define vpadd_f32(src1, src2) neon_faddp32(src1, src2)
#define vpaddq_f32(src1, src2) neon_faddpq32(src1, src2)

// ADDV/SADDLV/UADDLV
//
// Single-operand functions whose result type (__n8, __n16, float or __n64)
// differs from the vector operand type. No alias macros are defined for
// this group here.
__n8 neon_addv8(__n64);
__n8 neon_addvq8(__n128);
__n16 neon_addv16(__n64);
__n16 neon_addvq16(__n128);
float neon_addvq32(__n128);
__n16 neon_saddlv8(__n64);
__n16 neon_saddlvq8(__n128);
float neon_saddlv16(__n64);
float neon_saddlvq16(__n128);
__n64 neon_saddlvq32(__n128);
__n16 neon_uaddlv8(__n64);
__n16 neon_uaddlvq8(__n128);
float neon_uaddlv16(__n64);
float neon_uaddlvq16(__n128);
__n64 neon_uaddlvq32(__n128);

// SADALP/UADALP/SADDLP/UADDLP
//
// The *addlp functions take one vector; the *adalp functions take two.
__n64 neon_saddlp8(__n64);
__n128 neon_saddlpq8(__n128);
__n64 neon_saddlp16(__n64);
__n128 neon_saddlpq16(__n128);
__n64 neon_saddlp32(__n64);
__n128 neon_saddlpq32(__n128);
__n64 neon_uaddlp8(__n64);
__n128 neon_uaddlpq8(__n128);
__n64 neon_uaddlp16(__n64);
__n128 neon_uaddlpq16(__n128);
__n64 neon_uaddlp32(__n64);
__n128 neon_uaddlpq32(__n128);
__n64 neon_sadalp8(__n64, __n64);
__n128 neon_sadalpq8(__n128, __n128);
__n64 neon_sadalp16(__n64, __n64);
__n128 neon_sadalpq16(__n128, __n128);
__n64 neon_sadalp32(__n64, __n64);
__n128 neon_sadalpq32(__n128, __n128);
__n64 neon_uadalp8(__n64, __n64);
__n128 neon_uadalpq8(__n128, __n128);
__n64 neon_uadalp16(__n64, __n64);
__n128 neon_uadalpq16(__n128, __n128);
__n64 neon_uadalp32(__n64, __n64);
__n128 neon_uadalpq32(__n128, __n128);

// vpaddl aliases forward to neon_saddlp*/neon_uaddlp*; vpadal aliases
// forward to neon_sadalp*/neon_uadalp*, chosen by the _s/_u suffix.
#define vpaddl_s8(src) neon_saddlp8(src)
#define vpaddlq_s8(src) neon_saddlpq8(src)
#define vpaddl_s16(src) neon_saddlp16(src)
#define vpaddlq_s16(src) neon_saddlpq16(src)
#define vpaddl_s32(src) neon_saddlp32(src)
#define vpaddlq_s32(src) neon_saddlpq32(src)
#define vpaddl_u8(src) neon_uaddlp8(src)
#define vpaddlq_u8(src) neon_uaddlpq8(src)
#define vpaddl_u16(src) neon_uaddlp16(src)
#define vpaddlq_u16(src) neon_uaddlpq16(src)
#define vpaddl_u32(src) neon_uaddlp32(src)
#define vpaddlq_u32(src) neon_uaddlpq32(src)
#define vpadal_s8(src1, src2) neon_sadalp8(src1, src2)
// Remaining vpadal aliases: _s suffixes forward to neon_sadalp*, _u
// suffixes to neon_uadalp*.
#define vpadalq_s8(src1, src2) neon_sadalpq8(src1, src2)
#define vpadal_s16(src1, src2) neon_sadalp16(src1, src2)
#define vpadalq_s16(src1, src2) neon_sadalpq16(src1, src2)
#define vpadal_s32(src1, src2) neon_sadalp32(src1, src2)
#define vpadalq_s32(src1, src2) neon_sadalpq32(src1, src2)
#define vpadal_u8(src1, src2) neon_uadalp8(src1, src2)
#define vpadalq_u8(src1, src2) neon_uadalpq8(src1, src2)
#define vpadal_u16(src1, src2) neon_uadalp16(src1, src2)
#define vpadalq_u16(src1, src2) neon_uadalpq16(src1, src2)
#define vpadal_u32(src1, src2) neon_uadalp32(src1, src2)
#define vpadalq_u32(src1, src2) neon_uadalpq32(src1, src2)

// AESE/AESD/AESMC/AESIMC
//
// All four functions operate on __n128 only. neon_aese and neon_aesd take
// two operands; neon_aesmc and neon_aesimc take one.
__n128 neon_aese(__n128, __n128);
__n128 neon_aesd(__n128, __n128);
__n128 neon_aesmc(__n128);
__n128 neon_aesimc(__n128);

// Every alias of one operation forwards to the same function regardless of
// its type suffix.
#define aese_p8(src1, src2) neon_aese(src1, src2)
#define aese_s8(src1, src2) neon_aese(src1, src2)
#define aese_u8(src1, src2) neon_aese(src1, src2)
#define vaeseq_u8(src1, src2) neon_aese(src1, src2)
#define aesd_p8(src1, src2) neon_aesd(src1, src2)
#define aesd_s8(src1, src2) neon_aesd(src1, src2)
#define aesd_u8(src1, src2) neon_aesd(src1, src2)
#define vaesdq_u8(src1, src2) neon_aesd(src1, src2)
#define aesmc_p8(src) neon_aesmc(src)
#define aesmc_s8(src) neon_aesmc(src)
#define aesmc_u8(src) neon_aesmc(src)
#define vaesmcq_u8(src) neon_aesmc(src)
#define aesimc_p8(src) neon_aesimc(src)
#define aesimc_s8(src) neon_aesimc(src)
#define aesimc_u8(src) neon_aesimc(src)
#define vaesimcq_u8(src) neon_aesimc(src)

// AND/BIC/BIF/BIT/BSL/EOR/ORN/ORR
//
// One function per operation and register size, with no element width in
// the name. neon_bif/neon_bit/neon_bsl take three operands, the rest two.
__n64 neon_and(__n64, __n64);
__n128 neon_andq(__n128, __n128);
__n64 neon_eor(__n64, __n64);
__n128 neon_eorq(__n128, __n128);
__n64 neon_orn(__n64, __n64);
__n128 neon_ornq(__n128, __n128);
__n64 neon_orr(__n64, __n64);
__n128 neon_orrq(__n128, __n128);
__n64 neon_bic(__n64, __n64);
__n128 neon_bicq(__n128, __n128);
__n64 neon_bif(__n64, __n64, __n64);
__n128 neon_bifq(__n128, __n128, __n128);
__n64 neon_bit(__n64, __n64, __n64);
__n128 neon_bitq(__n128, __n128, __n128);
__n64 neon_bsl(__n64, __n64, __n64);
__n128 neon_bslq(__n128, __n128, __n128);

// Every typed alias of an operation forwards to the single function for its
// register size (neon_X for the plain name, neon_Xq for the 'q' name).
#define vand_s8(src1, src2) neon_and(src1, src2)
#define vand_u8(src1, src2) neon_and(src1, src2)
#define vand_s16(src1, src2) neon_and(src1, src2)
#define vand_u16(src1, src2) neon_and(src1, src2)
#define vand_s32(src1, src2) neon_and(src1, src2)
#define vand_u32(src1, src2) neon_and(src1, src2)
#define vand_s64(src1, src2) neon_and(src1, src2)
#define vand_u64(src1, src2) neon_and(src1, src2)
#define vandq_s8(src1, src2) neon_andq(src1, src2)
#define vandq_u8(src1, src2) neon_andq(src1, src2)
#define vandq_s16(src1, src2) neon_andq(src1, src2)
#define vandq_u16(src1, src2) neon_andq(src1, src2)
#define vandq_s32(src1, src2) neon_andq(src1, src2)
#define vandq_u32(src1, src2) neon_andq(src1, src2)
#define vandq_s64(src1, src2) neon_andq(src1, src2)
#define vandq_u64(src1, src2) neon_andq(src1, src2)
#define veor_s8(src1, src2) neon_eor(src1, src2)
#define veor_u8(src1, src2) neon_eor(src1, src2)
#define veor_s16(src1, src2) neon_eor(src1, src2)
#define veor_u16(src1, src2) neon_eor(src1, src2)
#define veor_s32(src1, src2) neon_eor(src1, src2)
#define veor_u32(src1, src2) neon_eor(src1, src2)
#define veor_s64(src1, src2) neon_eor(src1, src2)
#define veor_u64(src1, src2) neon_eor(src1, src2)
#define veorq_s8(src1, src2) neon_eorq(src1, src2)
#define veorq_u8(src1, src2) neon_eorq(src1, src2)
#define veorq_s16(src1, src2) neon_eorq(src1, src2)
#define veorq_u16(src1, src2) neon_eorq(src1, src2)
#define veorq_s32(src1, src2) neon_eorq(src1, src2)
#define veorq_u32(src1, src2) neon_eorq(src1, src2)
#define veorq_s64(src1, src2) neon_eorq(src1, src2)
#define veorq_u64(src1, src2) neon_eorq(src1, src2)
#define vorr_s8(src1, src2) neon_orr(src1, src2)
#define vorr_u8(src1, src2) neon_orr(src1, src2)
#define vorr_s16(src1, src2) neon_orr(src1, src2)
#define vorr_u16(src1, src2) neon_orr(src1, src2)
#define vorr_s32(src1, src2) neon_orr(src1, src2)
#define vorr_u32(src1, src2) neon_orr(src1, src2)
#define vorr_s64(src1, src2) neon_orr(src1, src2)
#define vorr_u64(src1, src2) neon_orr(src1, src2)
#define vorrq_s8(src1, src2) neon_orrq(src1, src2)
#define vorrq_u8(src1, src2) neon_orrq(src1, src2)
#define vorrq_s16(src1, src2) neon_orrq(src1, src2)
#define vorrq_u16(src1, src2) neon_orrq(src1, src2)
#define vorrq_s32(src1, src2) neon_orrq(src1, src2)
#define vorrq_u32(src1, src2) neon_orrq(src1, src2)
#define vorrq_s64(src1, src2) neon_orrq(src1, src2)
#define vorrq_u64(src1, src2) neon_orrq(src1, src2)
#define vorn_s8(src1, src2) neon_orn(src1, src2)
#define vorn_u8(src1, src2) neon_orn(src1, src2)
#define vorn_s16(src1, src2) neon_orn(src1, src2)
#define vorn_u16(src1, src2) neon_orn(src1, src2)
#define vorn_s32(src1, src2) neon_orn(src1, src2)
#define vorn_u32(src1, src2) neon_orn(src1, src2)
#define vorn_s64(src1, src2) neon_orn(src1, src2)
#define vorn_u64(src1, src2) neon_orn(src1, src2)
#define vornq_s8(src1, src2) neon_ornq(src1, src2)
#define vornq_u8(src1, src2) neon_ornq(src1, src2)
#define vornq_s16(src1, src2) neon_ornq(src1, src2)
#define vornq_u16(src1, src2) neon_ornq(src1, src2)
#define vornq_s32(src1, src2) neon_ornq(src1, src2)
#define vornq_u32(src1, src2) neon_ornq(src1, src2)
#define vornq_s64(src1, src2) neon_ornq(src1, src2)
#define vornq_u64(src1, src2) neon_ornq(src1, src2)
#define vbic_s8(src1, src2) neon_bic(src1, src2)
#define vbic_u8(src1, src2) neon_bic(src1, src2)
#define vbic_s16(src1, src2) neon_bic(src1, src2)
#define vbic_u16(src1, src2) neon_bic(src1, src2)
#define vbic_s32(src1, src2) neon_bic(src1, src2)
#define vbic_u32(src1, src2) neon_bic(src1, src2)
#define vbic_s64(src1, src2) neon_bic(src1, src2)
#define vbic_u64(src1, src2) neon_bic(src1, src2)
#define vbicq_s8(src1, src2) neon_bicq(src1, src2)
#define vbicq_u8(src1, src2) neon_bicq(src1, src2)
#define vbicq_s16(src1, src2) neon_bicq(src1, src2)
#define vbicq_u16(src1, src2) neon_bicq(src1, src2)
#define vbicq_s32(src1, src2) neon_bicq(src1, src2)
#define vbicq_u32(src1, src2) neon_bicq(src1, src2)
#define vbicq_s64(src1, src2) neon_bicq(src1, src2)
#define vbicq_u64(src1, src2) neon_bicq(src1, src2)
#define vbif_s8(src1, src2, src3) neon_bif(src1, src2, src3)
#define vbif_u8(src1, src2, src3) neon_bif(src1, src2, src3)
#define vbif_s16(src1, src2, src3) neon_bif(src1, src2, src3)
#define vbif_u16(src1, src2, src3) neon_bif(src1, src2, src3)
#define vbif_s32(src1, src2, src3) neon_bif(src1, src2, src3)
#define vbif_u32(src1, src2, src3) neon_bif(src1, src2, src3)
#define vbif_s64(src1, src2, src3) neon_bif(src1, src2, src3)
#define vbif_u64(src1, src2, src3) neon_bif(src1, src2, src3)
#define vbifq_s8(src1, src2, src3) neon_bifq(src1, src2, src3)
#define vbifq_u8(src1, src2, src3) neon_bifq(src1, src2, src3)
#define vbifq_s16(src1, src2, src3) neon_bifq(src1, src2, src3)
#define vbifq_u16(src1, src2, src3) neon_bifq(src1, src2, src3)
#define vbifq_s32(src1, src2, src3) neon_bifq(src1, src2, src3)
#define vbifq_u32(src1, src2, src3) neon_bifq(src1, src2, src3)
#define vbifq_s64(src1, src2, src3) neon_bifq(src1, src2, src3)
#define vbifq_u64(src1, src2, src3) neon_bifq(src1, src2, src3)
#define vbit_s8(src1, src2, src3) neon_bit(src1, src2, src3)
#define vbit_u8(src1, src2, src3) neon_bit(src1, src2, src3)
#define vbit_s16(src1, src2, src3) neon_bit(src1, src2, src3)
#define vbit_u16(src1, src2, src3) neon_bit(src1, src2, src3)
#define vbit_s32(src1, src2, src3) neon_bit(src1, src2, src3)
#define vbit_u32(src1, src2, src3) neon_bit(src1, src2, src3)
#define vbit_s64(src1, src2, src3) neon_bit(src1, src2, src3)
#define vbit_u64(src1, src2, src3) neon_bit(src1, src2, src3)
#define vbitq_s8(src1, src2, src3) neon_bitq(src1, src2, src3)
#define vbitq_u8(src1, src2, src3) neon_bitq(src1, src2, src3)
#define vbitq_s16(src1, src2, src3) neon_bitq(src1, src2, src3)
#define vbitq_u16(src1, src2, src3) neon_bitq(src1, src2, src3)
#define vbitq_s32(src1, src2, src3) neon_bitq(src1, src2, src3)
#define vbitq_u32(src1, src2, src3) neon_bitq(src1, src2, src3)
#define vbitq_s64(src1, src2, src3) neon_bitq(src1, src2, src3)
#define vbitq_u64(src1, src2, src3) neon_bitq(src1, src2, src3)
// vbsl/vbslq also provide _f32 and _f64 suffixes, again forwarding to the
// same neon_bsl/neon_bslq functions.
#define vbsl_s8(src1, src2, src3) neon_bsl(src1, src2, src3)
#define vbsl_u8(src1, src2, src3) neon_bsl(src1, src2, src3)
#define vbsl_s16(src1, src2, src3) neon_bsl(src1, src2, src3)
#define vbsl_u16(src1, src2, src3) neon_bsl(src1, src2, src3)
#define vbsl_s32(src1, src2, src3) neon_bsl(src1, src2, src3)
#define vbsl_f32(src1, src2, src3) neon_bsl(src1, src2, src3)
#define vbsl_u32(src1, src2, src3) neon_bsl(src1, src2, src3)
#define vbsl_s64(src1, src2, src3) neon_bsl(src1, src2, src3)
#define vbsl_f64(src1, src2, src3) neon_bsl(src1, src2, src3)
#define vbsl_u64(src1, src2, src3) neon_bsl(src1, src2, src3)
#define vbslq_s8(src1, src2, src3) neon_bslq(src1, src2, src3)
#define vbslq_u8(src1, src2, src3) neon_bslq(src1, src2, src3)
#define vbslq_s16(src1, src2, src3) neon_bslq(src1, src2, src3)
#define vbslq_u16(src1, src2, src3) neon_bslq(src1, src2, src3)
#define vbslq_s32(src1, src2, src3) neon_bslq(src1, src2, src3)
#define vbslq_f32(src1, src2, src3) neon_bslq(src1, src2, src3)
#define vbslq_u32(src1, src2, src3) neon_bslq(src1, src2, src3)
#define vbslq_s64(src1, src2, src3) neon_bslq(src1, src2, src3)
#define vbslq_u64(src1, src2, src3) neon_bslq(src1, src2, src3)
#define vbslq_f64(src1, src2, src3) neon_bslq(src1, src2, src3)

// BIC/ORR immediate
//
// Besides the vector operand these take one 'const int' argument, or two in
// the *_shift* forms. Names end in 'h' or 'w'.
// NOTE(review): presumably the ints are the immediate and the shift amount,
// and h/w select 16-/32-bit elements -- confirm against the compiler docs.
__n64 neon_bich(__n64, const int);
__n64 neon_bicw(__n64, const int);
__n64 neon_bic_shifth(__n64, const int, const int);
__n64 neon_bic_shiftw(__n64, const int, const int);
__n128 neon_bicqh(__n128, const int);
__n128 neon_bicqw(__n128, const int);
__n128 neon_bicq_shifth(__n128, const int, const int);
__n128 neon_bicq_shiftw(__n128, const int, const int);
__n64 neon_orrh(__n64, const int);
__n64 neon_orrw(__n64, const int);
__n64 neon_orr_shifth(__n64, const int, const int);
__n64 neon_orr_shiftw(__n64, const int, const int);
__n128 neon_orrqh(__n128, const int); __n128 neon_orrqw(__n128, const int); __n128 neon_orrq_shifth(__n128, const int, const int); __n128 neon_orrq_shiftw(__n128, const int, const int); // RBIT/REV16/REV32/REV64 __n64 neon_rbit(__n64); __n128 neon_rbitq(__n128); __n64 neon_rev16(__n64); __n128 neon_rev16q(__n128); __n64 neon_rev32_8(__n64); __n128 neon_rev32q_8(__n128); __n64 neon_rev32_16(__n64); __n128 neon_rev32q_16(__n128); __n64 neon_rev64_8(__n64); __n128 neon_rev64q_8(__n128); __n64 neon_rev64_16(__n64); __n128 neon_rev64q_16(__n128); __n64 neon_rev64_32(__n64); __n128 neon_rev64q_32(__n128); #define vrev16_p8(src) neon_rev16(src) #define vrev16_s8(src) neon_rev16(src) #define vrev16_u8(src) neon_rev16(src) #define vrev32_p8(src) neon_rev32_8(src) #define vrev32_s8(src) neon_rev32_8(src) #define vrev32_u8(src) neon_rev32_8(src) #define vrev32_p16(src) neon_rev32_16(src) #define vrev32_s16(src) neon_rev32_16(src) #define vrev32_u16(src) neon_rev32_16(src) #define vrev64_p8(src) neon_rev64_8(src) #define vrev64_s8(src) neon_rev64_8(src) #define vrev64_u8(src) neon_rev64_8(src) #define vrev64_p16(src) neon_rev64_16(src) #define vrev64_s16(src) neon_rev64_16(src) #define vrev64_u16(src) neon_rev64_16(src) #define vrev64_s32(src) neon_rev64_32(src) #define vrev64_u32(src) neon_rev64_32(src) #define vrev64_f32(src) neon_rev64_32(src) #define vrev16q_p8(src) neon_rev16q(src) #define vrev16q_s8(src) neon_rev16q(src) #define vrev16q_u8(src) neon_rev16q(src) #define vrev32q_p8(src) neon_rev32q_8(src) #define vrev32q_s8(src) neon_rev32q_8(src) #define vrev32q_u8(src) neon_rev32q_8(src) #define vrev32q_p16(src) neon_rev32q_16(src) #define vrev32q_s16(src) neon_rev32q_16(src) #define vrev32q_u16(src) neon_rev32q_16(src) #define vrev64q_p8(src) neon_rev64q_8(src) #define vrev64q_s8(src) neon_rev64q_8(src) #define vrev64q_u8(src) neon_rev64q_8(src) #define vrev64q_p16(src) neon_rev64q_16(src) #define vrev64q_s16(src) neon_rev64q_16(src) #define vrev64q_u16(src) 
neon_rev64q_16(src) #define vrev64q_s32(src) neon_rev64q_32(src) #define vrev64q_u32(src) neon_rev64q_32(src) #define vrev64q_f32(src) neon_rev64q_32(src) // CNT/CLS/CLZ __n64 neon_cnt(__n64); __n128 neon_cntq(__n128); __n64 neon_cls8(__n64); __n128 neon_clsq8(__n128); __n64 neon_cls16(__n64); __n128 neon_clsq16(__n128); __n64 neon_cls32(__n64); __n128 neon_clsq32(__n128); __n64 neon_clz8(__n64); __n128 neon_clzq8(__n128); __n64 neon_clz16(__n64); __n128 neon_clzq16(__n128); __n64 neon_clz32(__n64); __n128 neon_clzq32(__n128); #define vcnt_p8(src) neon_cnt(src) #define vcnt_s8(src) neon_cnt(src) #define vcnt_u8(src) neon_cnt(src) #define vcntq_p8(src) neon_cntq(src) #define vcntq_s8(src) neon_cntq(src) #define vcntq_u8(src) neon_cntq(src) #define vcls_s8(src) neon_cls8(src) #define vcls_s16(src) neon_cls16(src) #define vcls_s32(src) neon_cls32(src) #define vclsq_s8(src) neon_clsq8(src) #define vclsq_s16(src) neon_clsq16(src) #define vclsq_s32(src) neon_clsq32(src) #define vclz_s8(src) neon_clz8(src) #define vclz_s16(src) neon_clz16(src) #define vclz_s32(src) neon_clz32(src) #define vclz_u8(src) neon_clz8(src) #define vclz_u16(src) neon_clz16(src) #define vclz_u32(src) neon_clz32(src) #define vclzq_s8(src) neon_clzq8(src) #define vclzq_s16(src) neon_clzq16(src) #define vclzq_s32(src) neon_clzq32(src) #define vclzq_u8(src) neon_clzq8(src) #define vclzq_u16(src) neon_clzq16(src) #define vclzq_u32(src) neon_clzq32(src) // FMAX/FMAXNM/FMAXNMP/FMAXNMV/FMAXP/FMAXV/SMAX/SMAXP/SMAXV/UMAX/UMAXP/UMAXV __n64 neon_fmax32(__n64, __n64); __n128 neon_fmaxq32(__n128, __n128); __n128 neon_fmaxq64(__n128, __n128); __n64 neon_fmaxnm32(__n64, __n64); __n128 neon_fmaxnmq32(__n128, __n128); __n128 neon_fmaxnmq64(__n128, __n128); __n64 neon_fmaxnmp32(__n64, __n64); __n128 neon_fmaxnmpq32(__n128, __n128); __n128 neon_fmaxnmpq64(__n128, __n128); float neon_fmaxnmps32(__n128); double neon_fmaxnmps64(__n128); float neon_fmaxnmv(__n128); __n64 neon_fmaxp32(__n64, __n64); __n128 
neon_fmaxpq32(__n128, __n128); __n128 neon_fmaxpq64(__n128, __n128); float neon_fmaxps32(__n128); double neon_fmaxps64(__n128); float neon_fmaxv(__n128); __n64 neon_smax8(__n64, __n64); __n64 neon_smax16(__n64, __n64); __n64 neon_smax32(__n64, __n64); __n128 neon_smaxq8(__n128, __n128); __n128 neon_smaxq16(__n128, __n128); __n128 neon_smaxq32(__n128, __n128); __n64 neon_smaxp8(__n64, __n64); __n64 neon_smaxp16(__n64, __n64); __n64 neon_smaxp32(__n64, __n64); __n128 neon_smaxpq8(__n128, __n128); __n128 neon_smaxpq16(__n128, __n128); __n128 neon_smaxpq32(__n128, __n128); __n8 neon_smaxv8(__n64); __n8 neon_smaxvq8(__n128); __n16 neon_smaxv16(__n64); __n16 neon_smaxvq16(__n128); float neon_smaxvq32(__n128); __n64 neon_umax8(__n64, __n64); __n64 neon_umax16(__n64, __n64); __n64 neon_umax32(__n64, __n64); __n128 neon_umaxq8(__n128, __n128); __n128 neon_umaxq16(__n128, __n128); __n128 neon_umaxq32(__n128, __n128); __n64 neon_umaxp8(__n64, __n64); __n64 neon_umaxp16(__n64, __n64); __n64 neon_umaxp32(__n64, __n64); __n128 neon_umaxpq8(__n128, __n128); __n128 neon_umaxpq16(__n128, __n128); __n128 neon_umaxpq32(__n128, __n128); __n8 neon_umaxv8(__n64); __n8 neon_umaxvq8(__n128); __n16 neon_umaxv16(__n64); __n16 neon_umaxvq16(__n128); float neon_umaxvq32(__n128); #define vmax_f32(src1, src2) neon_fmax32(src1, src2) #define vmaxnm_f32(src1, src2) neon_fmaxnm32(src1, src2) #define vmaxq_f32(src1, src2) neon_fmaxq32(src1, src2) #define vmaxnmq_f32(src1, src2) neon_fmaxnmq32(src1, src2) #define vmax_s8(src1, src2) neon_smax8(src1, src2) #define vmax_s16(src1, src2) neon_smax16(src1, src2) #define vmax_s32(src1, src2) neon_smax32(src1, src2) #define vmax_u8(src1, src2) neon_umax8(src1, src2) #define vmax_u16(src1, src2) neon_umax16(src1, src2) #define vmax_u32(src1, src2) neon_umax32(src1, src2) #define vmaxq_s8(src1, src2) neon_smaxq8(src1, src2) #define vmaxq_s16(src1, src2) neon_smaxq16(src1, src2) #define vmaxq_s32(src1, src2) neon_smaxq32(src1, src2) #define vmaxq_u8(src1, 
src2) neon_umaxq8(src1, src2) #define vmaxq_u16(src1, src2) neon_umaxq16(src1, src2) #define vmaxq_u32(src1, src2) neon_umaxq32(src1, src2) #define vpmax_f32(src1, src2) neon_fmaxp32(src1, src2) #define vpmax_s8(src1, src2) neon_smaxp8(src1, src2) #define vpmax_s16(src1, src2) neon_smaxp16(src1, src2) #define vpmax_s32(src1, src2) neon_smaxp32(src1, src2) #define vpmax_u8(src1, src2) neon_umaxp8(src1, src2) #define vpmax_u16(src1, src2) neon_umaxp16(src1, src2) #define vpmax_u32(src1, src2) neon_umaxp32(src1, src2) // FMIN/FMINNM/FMINNMP/FMINNMV/FMINP/FMINV/SMIN/SMINP/SMINV/UMIN/UMINP/UMINV __n64 neon_fmin32(__n64, __n64); __n128 neon_fminq32(__n128, __n128); __n128 neon_fminq64(__n128, __n128); __n64 neon_fminnm32(__n64, __n64); __n128 neon_fminnmq32(__n128, __n128); __n128 neon_fminnmq64(__n128, __n128); __n64 neon_fminnmp32(__n64, __n64); __n128 neon_fminnmpq32(__n128, __n128); __n128 neon_fminnmpq64(__n128, __n128); float neon_fminnmps32(__n128); double neon_fminnmps64(__n128); float neon_fminnmv(__n128); __n64 neon_fminp32(__n64, __n64); __n128 neon_fminpq32(__n128, __n128); __n128 neon_fminpq64(__n128, __n128); float neon_fminps32(__n128); double neon_fminps64(__n128); float neon_fminv(__n128); __n64 neon_smin8(__n64, __n64); __n64 neon_smin16(__n64, __n64); __n64 neon_smin32(__n64, __n64); __n128 neon_sminq8(__n128, __n128); __n128 neon_sminq16(__n128, __n128); __n128 neon_sminq32(__n128, __n128); __n64 neon_sminp8(__n64, __n64); __n64 neon_sminp16(__n64, __n64); __n64 neon_sminp32(__n64, __n64); __n128 neon_sminpq8(__n128, __n128); __n128 neon_sminpq16(__n128, __n128); __n128 neon_sminpq32(__n128, __n128); __n8 neon_sminv8(__n64); __n8 neon_sminvq8(__n128); __n16 neon_sminv16(__n64); __n16 neon_sminvq16(__n128); float neon_sminvq32(__n128); __n64 neon_umin8(__n64, __n64); __n64 neon_umin16(__n64, __n64); __n64 neon_umin32(__n64, __n64); __n128 neon_uminq8(__n128, __n128); __n128 neon_uminq16(__n128, __n128); __n128 neon_uminq32(__n128, __n128); __n64 
neon_uminp8(__n64, __n64); __n64 neon_uminp16(__n64, __n64); __n64 neon_uminp32(__n64, __n64); __n128 neon_uminpq8(__n128, __n128); __n128 neon_uminpq16(__n128, __n128); __n128 neon_uminpq32(__n128, __n128); __n8 neon_uminv8(__n64); __n8 neon_uminvq8(__n128); __n16 neon_uminv16(__n64); __n16 neon_uminvq16(__n128); float neon_uminvq32(__n128); #define vmin_f32(src1, src2) neon_fmin32(src1, src2) #define vminnm_f32(src1, src2) neon_fminnm32(src1, src2) #define vminq_f32(src1, src2) neon_fminq32(src1, src2) #define vminnmq_f32(src1, src2) neon_fminnmq32(src1, src2) #define vmin_s8(src1, src2) neon_smin8(src1, src2) #define vmin_s16(src1, src2) neon_smin16(src1, src2) #define vmin_s32(src1, src2) neon_smin32(src1, src2) #define vmin_u8(src1, src2) neon_umin8(src1, src2) #define vmin_u16(src1, src2) neon_umin16(src1, src2) #define vmin_u32(src1, src2) neon_umin32(src1, src2) #define vminq_s8(src1, src2) neon_sminq8(src1, src2) #define vminq_s16(src1, src2) neon_sminq16(src1, src2) #define vminq_s32(src1, src2) neon_sminq32(src1, src2) #define vminq_u8(src1, src2) neon_uminq8(src1, src2) #define vminq_u16(src1, src2) neon_uminq16(src1, src2) #define vminq_u32(src1, src2) neon_uminq32(src1, src2) #define vpmin_f32(src1, src2) neon_fminp32(src1, src2) #define vpmin_s8(src1, src2) neon_sminp8(src1, src2) #define vpmin_s16(src1, src2) neon_sminp16(src1, src2) #define vpmin_s32(src1, src2) neon_sminp32(src1, src2) #define vpmin_u8(src1, src2) neon_uminp8(src1, src2) #define vpmin_u16(src1, src2) neon_uminp16(src1, src2) #define vpmin_u32(src1, src2) neon_uminp32(src1, src2) // EXT __n64 neon_ext8(__n64, __n64, const int); __n64 neon_ext16(__n64, __n64, const int); __n64 neon_ext32(__n64, __n64, const int); __n64 neon_ext64(__n64, __n64, const int); __n128 neon_extq8(__n128, __n128, const int); __n128 neon_extq16(__n128, __n128, const int); __n128 neon_extq32(__n128, __n128, const int); __n128 neon_extq64(__n128, __n128, const int); #define vext_s8(src1, src2, pos) 
neon_ext8(src1, src2, pos) #define vext_u8(src1, src2, pos) neon_ext8(src1, src2, pos) #define vext_s16(src1, src2, pos) neon_ext16(src1, src2, pos) #define vext_u16(src1, src2, pos) neon_ext16(src1, src2, pos) #define vext_s32(src1, src2, pos) neon_ext32(src1, src2, pos) #define vext_u32(src1, src2, pos) neon_ext32(src1, src2, pos) #define vext_s64(src1, src2, pos) neon_ext64(src1, src2, pos) #define vext_u64(src1, src2, pos) neon_ext64(src1, src2, pos) #define vext_p8(src1, src2, pos) neon_ext8(src1, src2, pos) #define vext_p16(src1, src2, pos) neon_ext16(src1, src2, pos) #define vext_p64(src1, src2, pos) neon_ext64(src1, src2, pos) #define vext_f32(src1, src2, pos) neon_ext32(src1, src2, pos) #define vext_f64(src1, src2, pos) neon_ext64(src1, src2, pos) #define vextq_s8(src1, src2, pos) neon_extq8(src1, src2, pos) #define vextq_u8(src1, src2, pos) neon_extq8(src1, src2, pos) #define vextq_s16(src1, src2, pos) neon_extq16(src1, src2, pos) #define vextq_u16(src1, src2, pos) neon_extq16(src1, src2, pos) #define vextq_s32(src1, src2, pos) neon_extq32(src1, src2, pos) #define vextq_u32(src1, src2, pos) neon_extq32(src1, src2, pos) #define vextq_s64(src1, src2, pos) neon_extq64(src1, src2, pos) #define vextq_u64(src1, src2, pos) neon_extq64(src1, src2, pos) #define vextq_p8(src1, src2, pos) neon_extq8(src1, src2, pos) #define vextq_p16(src1, src2, pos) neon_extq16(src1, src2, pos) #define vextq_p64(src1, src2, pos) neon_extq64(src1, src2, pos) #define vextq_f32(src1, src2, pos) neon_extq32(src1, src2, pos) #define vextq_f64(src1, src2, pos) neon_extq64(src1, src2, pos) // FABD/SABD/SABA/UABD/UABA __n64 neon_fabd32(__n64, __n64); __n128 neon_fabdq32(__n128, __n128); __n128 neon_fabdq64(__n128, __n128); float neon_fabds32(float, float); double neon_fabds64(double, double); __n64 neon_sabd8(__n64, __n64); __n64 neon_sabd16(__n64, __n64); __n64 neon_sabd32(__n64, __n64); __n128 neon_sabdq8(__n128, __n128); __n128 neon_sabdq16(__n128, __n128); __n128 neon_sabdq32(__n128, 
__n128); __n64 neon_saba8(__n64, __n64, __n64); __n64 neon_saba16(__n64, __n64, __n64); __n64 neon_saba32(__n64, __n64, __n64); __n128 neon_sabaq8(__n128, __n128, __n128); __n128 neon_sabaq16(__n128, __n128, __n128); __n128 neon_sabaq32(__n128, __n128, __n128); __n64 neon_uabd8(__n64, __n64); __n64 neon_uabd16(__n64, __n64); __n64 neon_uabd32(__n64, __n64); __n128 neon_uabdq8(__n128, __n128); __n128 neon_uabdq16(__n128, __n128); __n128 neon_uabdq32(__n128, __n128); __n64 neon_uaba8(__n64, __n64, __n64); __n64 neon_uaba16(__n64, __n64, __n64); __n64 neon_uaba32(__n64, __n64, __n64); __n128 neon_uabaq8(__n128, __n128, __n128); __n128 neon_uabaq16(__n128, __n128, __n128); __n128 neon_uabaq32(__n128, __n128, __n128); #define vabd_f32(src1, src2) neon_fabd32(src1, src2) #define vabds_f32(src1, src2) neon_fabds32(src1, src2) #define vabd_f64(src1, src2) neon_fabds64(src1, src2) #define vabdq_f32(src1, src2) neon_fabdq32(src1, src2) #define vabd_s8(src1, src2) neon_sabd8(src1, src2) #define vabd_s16(src1, src2) neon_sabd16(src1, src2) #define vabd_s32(src1, src2) neon_sabd32(src1, src2) #define vabd_u8(src1, src2) neon_uabd8(src1, src2) #define vabd_u16(src1, src2) neon_uabd16(src1, src2) #define vabd_u32(src1, src2) neon_uabd32(src1, src2) #define vabdq_s8(src1, src2) neon_sabdq8(src1, src2) #define vabdq_s16(src1, src2) neon_sabdq16(src1, src2) #define vabdq_s32(src1, src2) neon_sabdq32(src1, src2) #define vabdq_u8(src1, src2) neon_uabdq8(src1, src2) #define vabdq_u16(src1, src2) neon_uabdq16(src1, src2) #define vabdq_u32(src1, src2) neon_uabdq32(src1, src2) #define vabdq_f64(src1, src2) neon_uabdq64(src1, src2) #define vaba_s8(src1, src2, src3) neon_saba8(src1, src2, src3) #define vaba_s16(src1, src2, src3) neon_saba16(src1, src2, src3) #define vaba_s32(src1, src2, src3) neon_saba32(src1, src2, src3) #define vaba_u8(src1, src2, src3) neon_uaba8(src1, src2, src3) #define vaba_u16(src1, src2, src3) neon_uaba16(src1, src2, src3) #define vaba_u32(src1, src2, src3) 
neon_uaba32(src1, src2, src3) #define vabaq_s8(src1, src2, src3) neon_sabaq8(src1, src2, src3) #define vabaq_s16(src1, src2, src3) neon_sabaq16(src1, src2, src3) #define vabaq_s32(src1, src2, src3) neon_sabaq32(src1, src2, src3) #define vabaq_u8(src1, src2, src3) neon_uabaq8(src1, src2, src3) #define vabaq_u16(src1, src2, src3) neon_uabaq16(src1, src2, src3) #define vabaq_u32(src1, src2, src3) neon_uabaq32(src1, src2, src3) // FDIV __n64 neon_fdiv32(__n64, __n64); __n128 neon_fdivq32(__n128, __n128); __n128 neon_fdivq64(__n128, __n128); #define vdiv_f32(src1, src2) neon_fdiv32(src1, src2) #define vdivq_f32(src1, src2) neon_fdivq32(src1, src2) // FSQRT/FRSQRTE/URSQRTE/FRSQRTS __n64 neon_fsqrt32(__n64); __n128 neon_fsqrtq32(__n128); __n128 neon_fsqrtq64(__n128); __n64 neon_frsqrte32(__n64); __n128 neon_frsqrteq32(__n128); __n128 neon_frsqrteq64(__n128); float neon_frsqrtes32(float); double neon_frsqrtes64(double); __n64 neon_ursqrte32(__n64); __n128 neon_ursqrteq32(__n128); __n64 neon_frsqrts32(__n64, __n64); __n128 neon_frsqrtsq32(__n128, __n128); __n128 neon_frsqrtsq64(__n128, __n128); float neon_frsqrtss32(float, float); double neon_frsqrtss64(double, double); #define vrsqrte_f32(src) neon_frsqrte32(src) #define vrsqrte_u32(src) neon_ursqrte32(src) #define vrsqrteq_f32(src) neon_frsqrteq32(src) #define vrsqrteq_u32(src) neon_ursqrteq32(src) #define vrsqrts_f32(src1, src2) neon_frsqrts32(src1, src2) #define vrsqrtsq_f32(src1, src2) neon_frsqrtsq32(src1, src2) // PMUL/MUL/MLA/MLS/SQDMULH/SQRDMULH/FMUL/FMLA/FMLS/FMULX __n64 neon_pmul(__n64, __n64); __n128 neon_pmulq(__n128, __n128); __n128 neon_pmull_8(__n64, __n64); __n128 neon_pmull_q8(__n128, __n128); __n128 neon_pmull2_8(__n128, __n128); __n128 neon_pmull_64(__n64, __n64); __n128 neon_pmull_q64(__n128, __n128); __n128 neon_pmull2_64(__n128, __n128); __n64 neon_fmulvind32 (__n64, __n64, const int); __n128 neon_fmulqvind32(__n128, __n64, const int); __n128 neon_fmulqvind32q(__n128, __n128, const int); __n128 
neon_fmulqvind64(__n128, __n128, const int); __n64 neon_fmul32 (__n64, __n64); __n128 neon_fmulq32(__n128, __n128); __n128 neon_fmulq64(__n128, __n128); float neon_fmulsind32(float, __n128, const int); double neon_fmulsind64(double, __n128, const int); __n64 neon_fmlavind32 (__n64, __n64, __n64, const int); __n128 neon_fmlaqvind32(__n128, __n128, __n64, const int); __n128 neon_fmlaqvind32q(__n128, __n128, __n128, const int); __n128 neon_fmlaqvind64(__n128, __n128, __n128, const int); __n64 neon_fmla32 (__n64, __n64, __n64); __n128 neon_fmlaq32(__n128, __n128, __n128); __n128 neon_fmlaq64(__n128, __n128, __n128); float neon_fmlasind32(float, float, __n128, const int); double neon_fmlasind64(double, double, __n128, const int); __n64 neon_fmlsvind32 (__n64, __n64, __n64, const int); __n128 neon_fmlsqvind32(__n128, __n128, __n64, const int); __n128 neon_fmlsqvind32q(__n128, __n128, __n128, const int); __n128 neon_fmlsqvind64(__n128, __n128, __n128, const int); __n64 neon_fmls32 (__n64, __n64, __n64); __n128 neon_fmlsq32(__n128, __n128, __n128); __n128 neon_fmlsq64(__n128, __n128, __n128); float neon_fmlssind32(float, float, __n128, const int); double neon_fmlssind64(double, double, __n128, const int); __n64 neon_fmulxvind32 (__n64, __n64, const int); __n128 neon_fmulxqvind32(__n128, __n64, const int); __n128 neon_fmulxqvind32q(__n128, __n128, const int); __n128 neon_fmulxqvind64(__n128, __n128, const int); __n64 neon_fmulx32 (__n64, __n64); __n128 neon_fmulxq32(__n128, __n128); __n128 neon_fmulxq64(__n128, __n128); float neon_fmulxsind32(float, __n128, const int); double neon_fmulxsind64(double, __n128, const int); float neon_fmulxs32(float, float); double neon_fmulxs64(double, double); __n64 neon_mulvind16 (__n64, __n64, const int); __n64 neon_mulvind32 (__n64, __n64, const int); __n128 neon_mulqvind16(__n128, __n64, const int); __n128 neon_mulqvind32(__n128, __n64, const int); __n128 neon_mulqvind16q(__n128, __n128, const int); __n128 neon_mulqvind32q(__n128, __n128, 
const int); __n64 neon_mul8 (__n64, __n64); __n64 neon_mul16 (__n64, __n64); __n64 neon_mul32 (__n64, __n64); __n128 neon_mulq8 (__n128, __n128); __n128 neon_mulq16(__n128, __n128); __n128 neon_mulq32(__n128, __n128); __n64 neon_mlsvind16 (__n64, __n64, __n64, const int); __n64 neon_mlsvind32 (__n64, __n64, __n64, const int); __n128 neon_mlsqvind16(__n128, __n128, __n64, const int); __n128 neon_mlsqvind32(__n128, __n128, __n64, const int); __n128 neon_mlsqvind16q(__n128, __n128, __n128, const int); __n128 neon_mlsqvind32q(__n128, __n128, __n128, const int); __n64 neon_mls8 (__n64, __n64, __n64); __n64 neon_mls16 (__n64, __n64, __n64); __n64 neon_mls32 (__n64, __n64, __n64); __n128 neon_mlsq8 (__n128, __n128, __n128); __n128 neon_mlsq16(__n128, __n128, __n128); __n128 neon_mlsq32(__n128, __n128, __n128); __n64 neon_mlavind16 (__n64, __n64, __n64, const int); __n64 neon_mlavind32 (__n64, __n64, __n64, const int); __n128 neon_mlaqvind16(__n128, __n128, __n64, const int); __n128 neon_mlaqvind32(__n128, __n128, __n64, const int); __n128 neon_mlaqvind16q(__n128, __n128, __n128, const int); __n128 neon_mlaqvind32q(__n128, __n128, __n128, const int); __n64 neon_mla8 (__n64, __n64, __n64); __n64 neon_mla16 (__n64, __n64, __n64); __n64 neon_mla32 (__n64, __n64, __n64); __n128 neon_mlaq8 (__n128, __n128, __n128); __n128 neon_mlaq16(__n128, __n128, __n128); __n128 neon_mlaq32(__n128, __n128, __n128); __n64 neon_sqdmulhvind16 (__n64, __n64, const int); __n64 neon_sqdmulhvind32 (__n64, __n64, const int); __n128 neon_sqdmulhqvind16(__n128, __n64, const int); __n128 neon_sqdmulhqvind32(__n128, __n64, const int); __n128 neon_sqdmulhqvind16q(__n128, __n128, const int); __n128 neon_sqdmulhqvind32q(__n128, __n128, const int); __n64 neon_sqdmulh16 (__n64, __n64); __n64 neon_sqdmulh32 (__n64, __n64); __n128 neon_sqdmulhq16(__n128, __n128); __n128 neon_sqdmulhq32(__n128, __n128); __n16 neon_sqdmulhsind16(__n16, __n128, const int); float neon_sqdmulhsind32(float, __n128, const int); __n16 
neon_sqdmulhs16 (__n16, __n16); float neon_sqdmulhs32 (float, float); __n64 neon_sqrdmulhvind16 (__n64, __n64, const int); __n64 neon_sqrdmulhvind32 (__n64, __n64, const int); __n128 neon_sqrdmulhqvind16(__n128, __n64, const int); __n128 neon_sqrdmulhqvind32(__n128, __n64, const int); __n128 neon_sqrdmulhqvind16q(__n128, __n128, const int); __n128 neon_sqrdmulhqvind32q(__n128, __n128, const int); __n64 neon_sqrdmulh16 (__n64, __n64); __n64 neon_sqrdmulh32 (__n64, __n64); __n128 neon_sqrdmulhq16(__n128, __n128); __n128 neon_sqrdmulhq32(__n128, __n128); __n16 neon_sqrdmulhsind16(__n16, __n128, const int); float neon_sqrdmulhsind32(float, __n128, const int); __n16 neon_sqrdmulhs16 (__n16, __n16); float neon_sqrdmulhs32 (float, float); #define vmul_p8(src1, src2) neon_pmul(src1, src2) #define vmull_p8(src1, src2) neon_pmull_8(src1, src2) #define vmullq_p8(src1, src2) neon_pmull_q8(src1, src2) #define vmull_high_p8(src1, src2) neon_pmull2_8(src1, src2) #define vmull_p64(src1, src2) neon_pmull_64(src1, src2) #define vmullq_p64(src1, src2) neon_pmull_q64(src1, src2) #define vmull_high_p64(src1, src2) neon_pmull2_64(src1, src2) #define vmul_f32(src1, src2) neon_fmul32(src1, src2) #define vmul_s16(src1, src2) neon_mul16(src1, src2) #define vmul_s32(src1, src2) neon_mul32(src1, src2) #define vmul_s8(src1, src2) neon_mul8(src1, src2) #define vmul_u16(src1, src2) neon_mul16(src1, src2) #define vmul_u32(src1, src2) neon_mul32(src1, src2) #define vmul_u8(src1, src2) neon_mul8(src1, src2) #define vmulq_p8(src1, src2) neon_pmulq(src1, src2) #define vmulq_f32(src1, src2) neon_fmulq32(src1, src2) #define vmulq_s16(src1, src2) neon_mulq16(src1, src2) #define vmulq_s32(src1, src2) neon_mulq32(src1, src2) #define vmulq_s8(src1, src2) neon_mulq8(src1, src2) #define vmulq_u16(src1, src2) neon_mulq16(src1, src2) #define vmulq_u32(src1, src2) neon_mulq32(src1, src2) #define vmulq_u8(src1, src2) neon_mulq8(src1, src2) #define vmul_lane_f32(src1, src2, lane) neon_fmulvind32(src1, src2, lane) 
#define vmul_lane_s16(src1, src2, lane) neon_mulvind16(src1, src2, lane) #define vmul_lane_s32(src1, src2, lane) neon_mulvind32(src1, src2, lane) #define vmul_lane_u16(src1, src2, lane) neon_mulvind16(src1, src2, lane) #define vmul_lane_u32(src1, src2, lane) neon_mulvind32(src1, src2, lane) #define vmulq_lane_f32(src1, src2, lane) neon_fmulqvind32(src1, src2, lane) #define vmulq_lane_s16(src1, src2, lane) neon_mulqvind16(src1, src2, lane) #define vmulq_lane_s32(src1, src2, lane) neon_mulqvind32(src1, src2, lane) #define vmulq_lane_u16(src1, src2, lane) neon_mulqvind16(src1, src2, lane) #define vmulq_lane_u32(src1, src2, lane) neon_mulqvind32(src1, src2, lane) #define vqdmulh_lane_s16(src1, src2, lane) neon_sqdmulhvind16(src1, src2, lane) #define vqdmulh_lane_s32(src1, src2, lane) neon_sqdmulhvind32(src1, src2, lane) #define vqrdmulh_lane_s16(src1, src2, lane) neon_sqrdmulhvind16(src1, src2, lane) #define vqrdmulh_lane_s32(src1, src2, lane) neon_sqrdmulhvind32(src1, src2, lane) #define vqdmulhq_lane_s16(src1, src2, lane) neon_sqdmulhqvind16(src1, src2, lane) #define vqdmulhq_lane_s32(src1, src2, lane) neon_sqdmulhqvind32(src1, src2, lane) #define vqrdmulhq_lane_s16(src1, src2, lane) neon_sqrdmulhqvind16(src1, src2, lane) #define vqrdmulhq_lane_s32(src1, src2, lane) neon_sqrdmulhqvind32(src1, src2, lane) #define vqdmulh_s16(src1, src2) neon_sqdmulh16(src1, src2) #define vqdmulh_s32(src1, src2) neon_sqdmulh32(src1, src2) #define vqrdmulh_s16(src1, src2) neon_sqrdmulh16(src1, src2) #define vqrdmulh_s32(src1, src2) neon_sqrdmulh32(src1, src2) #define vqdmulhq_s16(src1, src2) neon_sqdmulhq16(src1, src2) #define vqdmulhq_s32(src1, src2) neon_sqdmulhq32(src1, src2) #define vqrdmulhq_s16(src1, src2) neon_sqrdmulhq16(src1, src2) #define vqrdmulhq_s32(src1, src2) neon_sqrdmulhq32(src1, src2) #define vmla_lane_f32(dst, src1, src2, lane) neon_fmlavind32(dst, src1, src2, lane) #define vmla_lane_s16(dst, src1, src2, lane) neon_mlavind16(dst, src1, src2, lane) #define 
vmla_lane_s32(dst, src1, src2, lane) neon_mlavind32(dst, src1, src2, lane) #define vmla_lane_u16(dst, src1, src2, lane) neon_mlavind16(dst, src1, src2, lane) #define vmla_lane_u32(dst, src1, src2, lane) neon_mlavind32(dst, src1, src2, lane) #define vmls_lane_f32(dst, src1, src2, lane) neon_fmlsvind32(dst, src1, src2, lane) #define vmls_lane_s16(dst, src1, src2, lane) neon_mlsvind16(dst, src1, src2, lane) #define vmls_lane_s32(dst, src1, src2, lane) neon_mlsvind32(dst, src1, src2, lane) #define vmls_lane_u16(dst, src1, src2, lane) neon_mlsvind16(dst, src1, src2, lane) #define vmls_lane_u32(dst, src1, src2, lane) neon_mlsvind32(dst, src1, src2, lane) #define vmlaq_lane_f32(dst, src1, src2, lane) neon_fmlaqvind32(dst, src1, src2, lane) #define vmlaq_lane_s16(dst, src1, src2, lane) neon_mlaqvind16(dst, src1, src2, lane) #define vmlaq_lane_s32(dst, src1, src2, lane) neon_mlaqvind32(dst, src1, src2, lane) #define vmlaq_lane_u16(dst, src1, src2, lane) neon_mlaqvind16(dst, src1, src2, lane) #define vmlaq_lane_u32(dst, src1, src2, lane) neon_mlaqvind32(dst, src1, src2, lane) #define vmlsq_lane_f32(dst, src1, src2, lane) neon_fmlsqvind32(dst, src1, src2, lane) #define vmlsq_lane_s16(dst, src1, src2, lane) neon_mlsqvind16(dst, src1, src2, lane) #define vmlsq_lane_s32(dst, src1, src2, lane) neon_mlsqvind32(dst, src1, src2, lane) #define vmlsq_lane_u16(dst, src1, src2, lane) neon_mlsqvind16(dst, src1, src2, lane) #define vmlsq_lane_u32(dst, src1, src2, lane) neon_mlsqvind32(dst, src1, src2, lane) #define vmla_f32(dst, src1, src2) neon_fmla32(dst, src1, src2) #define vmls_f32(dst, src1, src2) neon_fmls32(dst, src1, src2) #define vmlaq_f32(dst, src1, src2) neon_fmlaq32(dst, src1, src2) #define vmlsq_f32(dst, src1, src2) neon_fmlsq32(dst, src1, src2) #define vmla_s16(dst, src1, src2) neon_mla16(dst, src1, src2) #define vmla_s32(dst, src1, src2) neon_mla32(dst, src1, src2) #define vmla_s8(dst, src1, src2) neon_mla8(dst, src1, src2) #define vmla_u16(dst, src1, src2) neon_mla16(dst, 
src1, src2) #define vmla_u32(dst, src1, src2) neon_mla32(dst, src1, src2) #define vmla_u8(dst, src1, src2) neon_mla8(dst, src1, src2) #define vmls_s16(dst, src1, src2) neon_mls16(dst, src1, src2) #define vmls_s32(dst, src1, src2) neon_mls32(dst, src1, src2) #define vmls_s8(dst, src1, src2) neon_mls8(dst, src1, src2) #define vmls_u16(dst, src1, src2) neon_mls16(dst, src1, src2) #define vmls_u32(dst, src1, src2) neon_mls32(dst, src1, src2) #define vmls_u8(dst, src1, src2) neon_mls8(dst, src1, src2) #define vmlaq_s16(dst, src1, src2) neon_mlaq16(dst, src1, src2) #define vmlaq_s32(dst, src1, src2) neon_mlaq32(dst, src1, src2) #define vmlaq_s8(dst, src1, src2) neon_mlaq8(dst, src1, src2) #define vmlaq_u16(dst, src1, src2) neon_mlaq16(dst, src1, src2) #define vmlaq_u32(dst, src1, src2) neon_mlaq32(dst, src1, src2) #define vmlaq_u8(dst, src1, src2) neon_mlaq8(dst, src1, src2) #define vmlsq_s16(dst, src1, src2) neon_mlsq16(dst, src1, src2) #define vmlsq_s32(dst, src1, src2) neon_mlsq32(dst, src1, src2) #define vmlsq_s8(dst, src1, src2) neon_mlsq8(dst, src1, src2) #define vmlsq_u16(dst, src1, src2) neon_mlsq16(dst, src1, src2) #define vmlsq_u32(dst, src1, src2) neon_mlsq32(dst, src1, src2) #define vmlsq_u8(dst, src1, src2) neon_mlsq8(dst, src1, src2) #define vfmaq_f32(dst, src1, src2) neon_fmlaq32(dst, src1, src2) #define vfmsq_f32(dst, src1, src2) neon_fmlsq32(dst, src1, src2) // Multiply by scalar #define vmul_n_s16(Vd, Rt) vmul_lane_s16((Vd), vmov_n_s16(Rt), 0) #define vmul_n_s32(Vd, Rt) vmul_lane_s32((Vd), vmov_n_s32(Rt), 0) #define vmul_n_u16(Vd, Rt) vmul_lane_u16((Vd), vmov_n_u16(Rt), 0) #define vmul_n_u32(Vd, Rt) vmul_lane_u32((Vd), vmov_n_u32(Rt), 0) #define vmul_n_f32(Vd, Rt) vmul_lane_f32((Vd), vmov_n_f32(Rt), 0) #define vmulq_n_s16(Vd, Rt) vmulq_lane_s16((Vd), vmov_n_s16(Rt), 0) #define vmulq_n_s32(Vd, Rt) vmulq_lane_s32((Vd), vmov_n_s32(Rt), 0) #define vmulq_n_u16(Vd, Rt) vmulq_lane_u16((Vd), vmov_n_u16(Rt), 0) #define vmulq_n_u32(Vd, Rt) vmulq_lane_u32((Vd), 
vmov_n_u32(Rt), 0) #define vmulq_n_f32(Vd, Rt) vmulq_lane_f32((Vd), vmov_n_f32(Rt), 0) #define vqdmulh_n_s16(Vd, Rt) vqdmulh_lane_s16((Vd), vmov_n_s16(Rt), 0) #define vqdmulh_n_s32(Vd, Rt) vqdmulh_lane_s32((Vd), vmov_n_s32(Rt), 0) #define vqdmulhq_n_s16(Vd, Rt) vqdmulhq_lane_s16((Vd), vmov_n_s16(Rt), 0) #define vqdmulhq_n_s32(Vd, Rt) vqdmulhq_lane_s32((Vd), vmov_n_s32(Rt), 0) #define vqrdmulh_n_s16(Vd, Rt) vqrdmulh_lane_s16((Vd), vmov_n_s16(Rt), 0) #define vqrdmulh_n_s32(Vd, Rt) vqrdmulh_lane_s32((Vd), vmov_n_s32(Rt), 0) #define vqrdmulhq_n_s16(Vd, Rt) vqrdmulhq_lane_s16((Vd), vmov_n_s16(Rt), 0) #define vqrdmulhq_n_s32(Vd, Rt) vqrdmulhq_lane_s32((Vd), vmov_n_s32(Rt), 0) // Multiply by scalar with accumulate #define vmla_n_s16(Vd, Vn, Rt) vmla_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0) #define vmla_n_s32(Vd, Vn, Rt) vmla_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0) #define vmla_n_u16(Vd, Vn, Rt) vmla_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0) #define vmla_n_u32(Vd, Vn, Rt) vmla_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0) #define vmla_n_f32(Vd, Vn, Rt) vmla_lane_f32((Vd), (Vn), vmov_n_f32(Rt), 0) #define vmlaq_n_s16(Vd, Vn, Rt) vmlaq_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0) #define vmlaq_n_s32(Vd, Vn, Rt) vmlaq_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0) #define vmlaq_n_f32(Vd, Vn, Rt) vmlaq_lane_f32((Vd), (Vn), vmov_n_f32(Rt), 0) #define vmlaq_n_u16(Vd, Vn, Rt) vmlaq_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0) #define vmlaq_n_u32(Vd, Vn, Rt) vmlaq_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0) #define vmls_n_s16(Vd, Vn, Rt) vmls_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0) #define vmls_n_s32(Vd, Vn, Rt) vmls_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0) #define vmls_n_u16(Vd, Vn, Rt) vmls_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0) #define vmls_n_u32(Vd, Vn, Rt) vmls_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0) #define vmls_n_f32(Vd, Vn, Rt) vmls_lane_f32((Vd), (Vn), vmov_n_f32(Rt), 0) #define vmlsq_n_s16(Vd, Vn, Rt) vmlsq_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0) #define vmlsq_n_s32(Vd, Vn, Rt) vmlsq_lane_s32((Vd), 
(Vn), vmov_n_s32(Rt), 0) #define vmlsq_n_u16(Vd, Vn, Rt) vmlsq_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0) #define vmlsq_n_u32(Vd, Vn, Rt) vmlsq_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0) #define vmlsq_n_f32(Vd, Vn, Rt) vmlsq_lane_f32((Vd), (Vn), vmov_n_f32(Rt), 0) // SMULL(2)/UMULL(2)/SMLAL(2)/UMLAL(2)/SMLSL(2)/UMLSL(2)/SQDMULL(2)/SQDMLAL(2)/SQDMLSL(2) __n128 neon_smull_8(__n64, __n64); __n128 neon_smull_16(__n64, __n64); __n128 neon_smull_32(__n64, __n64); __n128 neon_smull2_8(__n128, __n128); __n128 neon_smull2_16(__n128, __n128); __n128 neon_smull2_32(__n128, __n128); __n128 neon_smull_i16(__n64, __n64, const int); __n128 neon_smull_i32(__n64, __n64, const int); __n128 neon_smull2_i16(__n128, __n64, const int); __n128 neon_smull2_i32(__n128, __n64, const int); __n128 neon_smull_qi16(__n64, __n128, const int); __n128 neon_smull_qi32(__n64, __n128, const int); __n128 neon_smull2_qi16(__n128, __n128, const int); __n128 neon_smull2_qi32(__n128, __n128, const int); __n128 neon_umull_8(__n64, __n64); __n128 neon_umull_16(__n64, __n64); __n128 neon_umull_32(__n64, __n64); __n128 neon_umull2_8(__n128, __n128); __n128 neon_umull2_16(__n128, __n128); __n128 neon_umull2_32(__n128, __n128); __n128 neon_umull_i16(__n64, __n64, const int); __n128 neon_umull_i32(__n64, __n64, const int); __n128 neon_umull2_i16(__n128, __n64, const int); __n128 neon_umull2_i32(__n128, __n64, const int); __n128 neon_umull_qi16(__n64, __n128, const int); __n128 neon_umull_qi32(__n64, __n128, const int); __n128 neon_umull2_qi16(__n128, __n128, const int); __n128 neon_umull2_qi32(__n128, __n128, const int); #define vmull_s8(src1, src2) neon_smull_8(src1, src2) #define vmull_s16(src1, src2) neon_smull_16(src1, src2) #define vmull_s32(src1, src2) neon_smull_32(src1, src2) #define vmull_high_s8(src1, src2) neon_smull2_8(src1, src2) #define vmull_high_s16(src1, src2) neon_smull2_16(src1, src2) #define vmull_high_s32(src1, src2) neon_smull2_32(src1, src2) #define vmull_lane_s16(src1, src2, src3) 
neon_smull_i16(src1, src2, src3) #define vmull_lane_s32(src1, src2, src3) neon_smull_i32(src1, src2, src3) #define vmull_high_lane_s16(src1, src2, src3) neon_smull2_i16(src1, src2, src3) #define vmull_high_lane_s32(src1, src2, src3) neon_smull2_i32(src1, src2, src3) #define vmull_laneq_s16(src1, src2, src3) neon_smull_qi16(src1, src2, src3) #define vmull_laneq_s32(src1, src2, src3) neon_smull_qi32(src1, src2, src3) #define vmull_high_laneq_s16(src1, src2, src3) neon_smull2_qi16(src1, src2, src3) #define vmull_high_laneq_s32(src1, src2, src3) neon_smull2_qi32(src1, src2, src3) #define vmull_u8(src1, src2) neon_umull_8(src1, src2) #define vmull_u16(src1, src2) neon_umull_16(src1, src2) #define vmull_u32(src1, src2) neon_umull_32(src1, src2) #define vmull_high_u8(src1, src2) neon_umull2_8(src1, src2) #define vmull_high_u16(src1, src2) neon_umull2_16(src1, src2) #define vmull_high_u32(src1, src2) neon_umull2_32(src1, src2) #define vmull_lane_u16(src1, src2, src3) neon_umull_i16(src1, src2, src3) #define vmull_lane_u32(src1, src2, src3) neon_umull_i32(src1, src2, src3) #define vmull_high_lane_u16(src1, src2, src3) neon_umull2_i16(src1, src2, src3) #define vmull_high_lane_u32(src1, src2, src3) neon_umull2_i32(src1, src2, src3) #define vmull_laneq_u16(src1, src2, src3) neon_umull_qi16(src1, src2, src3) #define vmull_laneq_u32(src1, src2, src3) neon_umull_qi32(src1, src2, src3) #define vmull_high_laneq_u16(src1, src2, src3) neon_umull2_qi16(src1, src2, src3) #define vmull_high_laneq_u32(src1, src2, src3) neon_umull2_qi32(src1, src2, src3) #define vmull_n_s16(src1, src2) vmull_lane_s16(src1, vmov_n_s16(src2), 0) #define vmull_n_s32(src1, src2) vmull_lane_s32(src1, vmov_n_s32(src2), 0) #define vmull_high_n_s16(src1, src2) vmull_high_lane_s16(src1, vmov_n_s16(src2), 0) #define vmull_high_n_s32(src1, src2) vmull_high_lane_s32(src1, vmov_n_s32(src2), 0) #define vmull_n_u16(src1, src2) vmull_lane_u16(src1, vmov_n_u16(src2), 0) #define vmull_n_u32(src1, src2) vmull_lane_u32(src1, 
vmov_n_u32(src2), 0) #define vmull_high_n_u16(src1, src2) vmull_high_lane_u16(src1, vmov_n_u16(src2), 0) #define vmull_high_n_u32(src1, src2) vmull_high_lane_u32(src1, vmov_n_u32(src2), 0) __n128 neon_smlal_8(__n128, __n64, __n64); __n128 neon_smlal_16(__n128, __n64, __n64); __n128 neon_smlal_32(__n128, __n64, __n64); __n128 neon_smlal2_8(__n128, __n64, __n128); __n128 neon_smlal2_16(__n128, __n128, __n128); __n128 neon_smlal2_32(__n128, __n128, __n128); __n128 neon_smlal_i16(__n128, __n64, __n64, const int); __n128 neon_smlal_i32(__n128, __n64, __n64, const int); __n128 neon_smlal2_i16(__n128, __n128, __n64, const int); __n128 neon_smlal2_i32(__n128, __n128, __n64, const int); __n128 neon_smlal_qi16(__n128, __n64, __n128, const int); __n128 neon_smlal_qi32(__n128, __n64, __n128, const int); __n128 neon_smlal2_qi16(__n128, __n128, __n128, const int); __n128 neon_smlal2_qi32(__n128, __n128, __n128, const int); __n128 neon_umlal_8(__n128, __n64, __n64); __n128 neon_umlal_16(__n128, __n64, __n64); __n128 neon_umlal_32(__n128, __n64, __n64); __n128 neon_umlal2_8(__n128, __n128, __n128); __n128 neon_umlal2_16(__n128, __n128, __n128); __n128 neon_umlal2_32(__n128, __n128, __n128); __n128 neon_umlal_i16(__n128, __n64, __n64, const int); __n128 neon_umlal_i32(__n128, __n64, __n64, const int); __n128 neon_umlal2_i16(__n128, __n128, __n64, const int); __n128 neon_umlal2_i32(__n128, __n128, __n64, const int); __n128 neon_umlal_qi16(__n128, __n64, __n128, const int); __n128 neon_umlal_qi32(__n128, __n64, __n128, const int); __n128 neon_umlal2_qi16(__n128, __n128, __n128, const int); __n128 neon_umlal2_qi32(__n128, __n128, __n128, const int); #define vmlal_s8(src1, src2, src3) neon_smlal_8(src1, src2, src3) #define vmlal_s16(src1, src2, src3) neon_smlal_16(src1, src2, src3) #define vmlal_s32(src1, src2, src3) neon_smlal_32(src1, src2, src3) #define vmlal_high_s8(src1, src2, src3) neon_smlal2_8(src1, src2, src3) #define vmlal_high_s16(src1, src2, src3) neon_smlal2_16(src1, src2, 
src3)
#define vmlal_high_s32(src1, src2, src3) neon_smlal2_32(src1, src2, src3)
#define vmlal_lane_s16(src1, src2, src3, src4) neon_smlal_i16(src1, src2, src3, src4)
#define vmlal_lane_s32(src1, src2, src3, src4) neon_smlal_i32(src1, src2, src3, src4)
#define vmlal_high_lane_s16(src1, src2, src3, src4) neon_smlal2_i16(src1, src2, src3, src4)
#define vmlal_high_lane_s32(src1, src2, src3, src4) neon_smlal2_i32(src1, src2, src3, src4)
#define vmlal_laneq_s16(src1, src2, src3, src4) neon_smlal_qi16(src1, src2, src3, src4)
#define vmlal_laneq_s32(src1, src2, src3, src4) neon_smlal_qi32(src1, src2, src3, src4)
#define vmlal_high_laneq_s16(src1, src2, src3, src4) neon_smlal2_qi16(src1, src2, src3, src4)
#define vmlal_high_laneq_s32(src1, src2, src3, src4) neon_smlal2_qi32(src1, src2, src3, src4)

// Unsigned vmlal_* names forward to the neon_umlal* prototypes.
#define vmlal_u8(src1, src2, src3) neon_umlal_8(src1, src2, src3)
#define vmlal_u16(src1, src2, src3) neon_umlal_16(src1, src2, src3)
#define vmlal_u32(src1, src2, src3) neon_umlal_32(src1, src2, src3)
#define vmlal_high_u8(src1, src2, src3) neon_umlal2_8(src1, src2, src3)
#define vmlal_high_u16(src1, src2, src3) neon_umlal2_16(src1, src2, src3)
#define vmlal_high_u32(src1, src2, src3) neon_umlal2_32(src1, src2, src3)
#define vmlal_lane_u16(src1, src2, src3, src4) neon_umlal_i16(src1, src2, src3, src4)
#define vmlal_lane_u32(src1, src2, src3, src4) neon_umlal_i32(src1, src2, src3, src4)
#define vmlal_high_lane_u16(src1, src2, src3, src4) neon_umlal2_i16(src1, src2, src3, src4)
#define vmlal_high_lane_u32(src1, src2, src3, src4) neon_umlal2_i32(src1, src2, src3, src4)
#define vmlal_laneq_u16(src1, src2, src3, src4) neon_umlal_qi16(src1, src2, src3, src4)
#define vmlal_laneq_u32(src1, src2, src3, src4) neon_umlal_qi32(src1, src2, src3, src4)
#define vmlal_high_laneq_u16(src1, src2, src3, src4) neon_umlal2_qi16(src1, src2, src3, src4)
#define vmlal_high_laneq_u32(src1, src2, src3, src4) neon_umlal2_qi32(src1, src2, src3, src4)

// First of the vmlal _n_ forms; its expansion continues on the next line.
#define vmlal_n_s16(src1, src2, src3) \
vmlal_lane_s16(src1, src2, vmov_n_s16(src3), 0) #define vmlal_n_s32(src1, src2, src3) vmlal_lane_s32(src1, src2, vmov_n_s32(src3), 0) #define vmlal_high_n_s16(src1, src2, src3) vmlal_high_lane_s16(src1, src2, vmov_n_s16(src3), 0) #define vmlal_high_n_s32(src1, src2, src3) vmlal_high_lane_s32(src1, src2, vmov_n_s32(src3), 0) #define vmlal_n_u16(src1, src2, src3) vmlal_lane_u16(src1, src2, vmov_n_u16(src3), 0) #define vmlal_n_u32(src1, src2, src3) vmlal_lane_u32(src1, src2, vmov_n_u32(src3), 0) #define vmlal_high_n_u16(src1, src2, src3) vmlal_high_lane_u16(src1, src2, vmov_n_u16(src3), 0) #define vmlal_high_n_u32(src1, src2, src3) vmlal_high_lane_u32(src1, src2, vmov_n_u32(src3), 0) __n128 neon_smlsl_8(__n128, __n64, __n64); __n128 neon_smlsl_16(__n128, __n64, __n64); __n128 neon_smlsl_32(__n128, __n64, __n64); __n128 neon_smlsl2_8(__n128, __n128, __n128); __n128 neon_smlsl2_16(__n128, __n128, __n128); __n128 neon_smlsl2_32(__n128, __n128, __n128); __n128 neon_smlsl_i16(__n128, __n64, __n64, const int); __n128 neon_smlsl_i32(__n128, __n64, __n64, const int); __n128 neon_smlsl2_i16(__n128, __n128, __n64, const int); __n128 neon_smlsl2_i32(__n128, __n128, __n64, const int); __n128 neon_smlsl_qi16(__n128, __n64, __n128, const int); __n128 neon_smlsl_qi32(__n128, __n64, __n128, const int); __n128 neon_smlsl2_qi16(__n128, __n128, __n128, const int); __n128 neon_smlsl2_qi32(__n128, __n128, __n128, const int); __n128 neon_umlsl_8(__n128, __n64, __n64); __n128 neon_umlsl_16(__n128, __n64, __n64); __n128 neon_umlsl_32(__n128, __n64, __n64); __n128 neon_umlsl2_8(__n128, __n128, __n128); __n128 neon_umlsl2_16(__n128, __n128, __n128); __n128 neon_umlsl2_32(__n128, __n128, __n128); __n128 neon_umlsl_i16(__n128, __n64, __n64, const int); __n128 neon_umlsl_i32(__n128, __n64, __n64, const int); __n128 neon_umlsl2_i16(__n128, __n128, __n64, const int); __n128 neon_umlsl2_i32(__n128, __n128, __n64, const int); __n128 neon_umlsl_qi16(__n128, __n64, __n128, const int); __n128 
neon_umlsl_qi32(__n128, __n64, __n128, const int);
__n128 neon_umlsl2_qi16(__n128, __n128, __n128, const int);
__n128 neon_umlsl2_qi32(__n128, __n128, __n128, const int);

// vmlsl_* names forward their arguments unchanged to the neon_smlsl* prototypes.
#define vmlsl_s8(src1, src2, src3) neon_smlsl_8(src1, src2, src3)
#define vmlsl_s16(src1, src2, src3) neon_smlsl_16(src1, src2, src3)
#define vmlsl_s32(src1, src2, src3) neon_smlsl_32(src1, src2, src3)
#define vmlsl_high_s8(src1, src2, src3) neon_smlsl2_8(src1, src2, src3)
#define vmlsl_high_s16(src1, src2, src3) neon_smlsl2_16(src1, src2, src3)
#define vmlsl_high_s32(src1, src2, src3) neon_smlsl2_32(src1, src2, src3)
#define vmlsl_lane_s16(src1, src2, src3, src4) neon_smlsl_i16(src1, src2, src3, src4)
#define vmlsl_lane_s32(src1, src2, src3, src4) neon_smlsl_i32(src1, src2, src3, src4)
#define vmlsl_high_lane_s16(src1, src2, src3, src4) neon_smlsl2_i16(src1, src2, src3, src4)
#define vmlsl_high_lane_s32(src1, src2, src3, src4) neon_smlsl2_i32(src1, src2, src3, src4)
#define vmlsl_laneq_s16(src1, src2, src3, src4) neon_smlsl_qi16(src1, src2, src3, src4)
#define vmlsl_laneq_s32(src1, src2, src3, src4) neon_smlsl_qi32(src1, src2, src3, src4)
#define vmlsl_high_laneq_s16(src1, src2, src3, src4) neon_smlsl2_qi16(src1, src2, src3, src4)
#define vmlsl_high_laneq_s32(src1, src2, src3, src4) neon_smlsl2_qi32(src1, src2, src3, src4)

// Unsigned vmlsl_* names forward to the neon_umlsl* prototypes.
#define vmlsl_u8(src1, src2, src3) neon_umlsl_8(src1, src2, src3)
#define vmlsl_u16(src1, src2, src3) neon_umlsl_16(src1, src2, src3)
#define vmlsl_u32(src1, src2, src3) neon_umlsl_32(src1, src2, src3)
#define vmlsl_high_u8(src1, src2, src3) neon_umlsl2_8(src1, src2, src3)
#define vmlsl_high_u16(src1, src2, src3) neon_umlsl2_16(src1, src2, src3)
#define vmlsl_high_u32(src1, src2, src3) neon_umlsl2_32(src1, src2, src3)
#define vmlsl_lane_u16(src1, src2, src3, src4) neon_umlsl_i16(src1, src2, src3, src4)
#define vmlsl_lane_u32(src1, src2, src3, src4) neon_umlsl_i32(src1, src2, src3, src4)
#define vmlsl_high_lane_u16(src1, src2, src3, src4) neon_umlsl2_i16(src1, src2, src3, src4)
#define vmlsl_high_lane_u32(src1, src2, src3, src4) neon_umlsl2_i32(src1, src2, src3, src4) #define vmlsl_laneq_u16(src1, src2, src3, src4) neon_umlsl_qi16(src1, src2, src3, src4) #define vmlsl_laneq_u32(src1, src2, src3, src4) neon_umlsl_qi32(src1, src2, src3, src4) #define vmlsl_high_laneq_u16(src1, src2, src3, src4) neon_umlsl2_qi16(src1, src2, src3, src4) #define vmlsl_high_laneq_u32(src1, src2, src3, src4) neon_umlsl2_qi32(src1, src2, src3, src4) #define vmlsl_n_s16(src1, src2, src3) vmlsl_lane_s16(src1, src2, vmov_n_s16(src3), 0) #define vmlsl_n_s32(src1, src2, src3) vmlsl_lane_s32(src1, src2, vmov_n_s32(src3), 0) #define vmlsl_high_n_s16(src1, src2, src3) vmlsl_high_lane_s16(src1, src2, vmov_n_s16(src3), 0) #define vmlsl_high_n_s32(src1, src2, src3) vmlsl_high_lane_s32(src1, src2, vmov_n_s32(src3), 0) #define vmlsl_n_u16(src1, src2, src3) vmlsl_lane_u16(src1, src2, vmov_n_u16(src3), 0) #define vmlsl_n_u32(src1, src2, src3) vmlsl_lane_u32(src1, src2, vmov_n_u32(src3), 0) #define vmlsl_high_n_u16(src1, src2, src3) vmlsl_high_lane_u16(src1, src2, vmov_n_u16(src3), 0) #define vmlsl_high_n_u32(src1, src2, src3) vmlsl_high_lane_u32(src1, src2, vmov_n_u32(src3), 0) __n128 neon_sqdmull_16(__n64, __n64); __n128 neon_sqdmull_32(__n64, __n64); __n128 neon_sqdmull2_16(__n128, __n128); __n128 neon_sqdmull2_32(__n128, __n128); __n128 neon_sqdmull_i16(__n64, __n64, const int); __n128 neon_sqdmull_i32(__n64, __n64, const int); __n128 neon_sqdmull2_i16(__n128, __n64, const int); __n128 neon_sqdmull2_i32(__n128, __n64, const int); __n128 neon_sqdmull_qi16(__n64, __n128, const int); __n128 neon_sqdmull_qi32(__n64, __n128, const int); __n128 neon_sqdmull2_qi16(__n128, __n128, const int); __n128 neon_sqdmull2_qi32(__n128, __n128, const int); #define vqdmull_s16(src1, src2) neon_sqdmull_16(src1, src2) #define vqdmull_s32(src1, src2) neon_sqdmull_32(src1, src2) #define vqdmull_high_s16(src1, src2) neon_sqdmull2_16(src1, src2) #define vqdmull_high_s32(src1, src2) 
neon_sqdmull2_32(src1, src2) #define vqdmull_lane_s16(src1, src2, src3) neon_sqdmull_i16(src1, src2, src3) #define vqdmull_lane_s32(src1, src2, src3) neon_sqdmull_i32(src1, src2, src3) #define vqdmull_high_lane_s16(src1, src2, src3) neon_sqdmull2_i16(src1, src2, src3) #define vqdmull_high_lane_s32(src1, src2, src3) neon_sqdmull2_i32(src1, src2, src3) #define vqdmull_laneq_s16(src1, src2, src3) neon_sqdmull_qi16(src1, src2, src3) #define vqdmull_laneq_s32(src1, src2, src3) neon_sqdmull_qi32(src1, src2, src3) #define vqdmull_high_laneq_s16(src1, src2, src3) neon_sqdmull2_qi16(src1, src2, src3) #define vqdmull_high_laneq_s32(src1, src2, src3) neon_sqdmull2_qi32(src1, src2, src3) #define vqdmull_n_s16(src1, src2) vqdmull_lane_s16(src1, vmov_n_s16(src2), 0) #define vqdmull_n_s32(src1, src2) vqdmull_lane_s32(src1, vmov_n_s32(src2), 0) #define vqdmull_high_n_s16(src1, src2) vqdmull_high_lane_s16(src1, vmov_n_s16(src2), 0) #define vqdmull_high_n_s32(src1, src2) vqdmull_high_lane_s32(src1, vmov_n_s32(src2), 0) float neon_sqdmullh_16(__n16, __n16); __n64 neon_sqdmulls_32(float, float); float neon_sqdmullh_i16(__n16, __n64, const int); __n64 neon_sqdmulls_i32(float, __n64, const int); float neon_sqdmullh_qi16(__n16, __n128, const int); __n64 neon_sqdmulls_qi32(float, __n128, const int); __n128 neon_sqdmlal_16(__n128, __n64, __n64); __n128 neon_sqdmlal_32(__n128, __n64, __n64); __n128 neon_sqdmlal2_16(__n128, __n128, __n128); __n128 neon_sqdmlal2_32(__n128, __n128, __n128); __n128 neon_sqdmlal_i16(__n128, __n64, __n64, const int); __n128 neon_sqdmlal_i32(__n128, __n64, __n64, const int); __n128 neon_sqdmlal2_i16(__n128, __n128, __n64, const int); __n128 neon_sqdmlal2_i32(__n128, __n128, __n64, const int); __n128 neon_sqdmlal_qi16(__n128, __n64, __n128, const int); __n128 neon_sqdmlal_qi32(__n128, __n64, __n128, const int); __n128 neon_sqdmlal2_qi16(__n128, __n128, __n128, const int); __n128 neon_sqdmlal2_qi32(__n128, __n128, __n128, const int); #define vqdmullh_s16(src1, src2) 
neon_sqdmullh_16(src1, src2) #define vqdmulls_s32(src1, src2) neon_sqdmulls_32(src1, src2) #define vqdmullh_lane_s16(src1, src2, src3) neon_sqdmullh_i16(src1, src2, src3) #define vqdmulls_lane_s32(src1, src2, src3) neon_sqdmulls_i32(src1, src2, src3) #define vqdmullh_laneq_s16(src1, src2, src3) neon_sqdmullh_qi16(src1, src2, src3) #define vqdmulls_laneq_s32(src1, src2, src3) neon_sqdmulls_qi32(src1, src2, src3) #define vqdmlal_s16(src1, src2, src3) neon_sqdmlal_16(src1, src2, src3) #define vqdmlal_s32(src1, src2, src3) neon_sqdmlal_32(src1, src2, src3) #define vqdmlal_high_s16(src1, src2, src3) neon_sqdmlal2_16(src1, src2, src3) #define vqdmlal_high_s32(src1, src2, src3) neon_sqdmlal2_32(src1, src2, src3) #define vqdmlal_lane_s16(src1, src2, src3, src4) neon_sqdmlal_i16(src1, src2, src3, src4) #define vqdmlal_lane_s32(src1, src2, src3, src4) neon_sqdmlal_i32(src1, src2, src3, src4) #define vqdmlal_high_lane_s16(src1, src2, src3, src4) neon_sqdmlal2_i16(src1, src2, src3, src4) #define vqdmlal_high_lane_s32(src1, src2, src3, src4) neon_sqdmlal2_i32(src1, src2, src3, src4) #define vqdmlal_laneq_s16(src1, src2, src3, src4) neon_sqdmlal_qi16(src1, src2, src3, src4) #define vqdmlal_laneq_s32(src1, src2, src3, src4) neon_sqdmlal_qi32(src1, src2, src3, src4) #define vqdmlal_high_laneq_s16(src1, src2, src3, src4) neon_sqdmlal2_qi16(src1, src2, src3, src4) #define vqdmlal_high_laneq_s32(src1, src2, src3, src4) neon_sqdmlal2_qi32(src1, src2, src3, src4) #define vqdmlal_n_s16(src1, src2, src3) vqdmlal_lane_s16(src1, src2, vmov_n_s16(src3), 0) #define vqdmlal_n_s32(src1, src2, src3) vqdmlal_lane_s32(src1, src2, vmov_n_s32(src3), 0) #define vqdmlal_high_n_s16(src1, src2, src3) vqdmlal_high_lane_s16(src1, src2, vmov_n_s16(src3), 0) #define vqdmlal_high_n_s32(src1, src2, src3) vqdmlal_high_lane_s32(src1, src2, vmov_n_s32(src3), 0) float neon_sqdmlalh_16(float, __n16, __n16); __n64 neon_sqdmlals_32(__n64, float, float); float neon_sqdmlalh_i16(float, __n16, __n64, const int); __n64 
neon_sqdmlals_i32(__n64, float, __n64, const int); float neon_sqdmlalh_qi16(float, __n16, __n128, const int); __n64 neon_sqdmlals_qi32(__n64, float, __n128, const int); __n128 neon_sqdmlsl_16(__n128, __n64, __n64); __n128 neon_sqdmlsl_32(__n128, __n64, __n64); __n128 neon_sqdmlsl2_16(__n128, __n128, __n128); __n128 neon_sqdmlsl2_32(__n128, __n128, __n128); __n128 neon_sqdmlsl_i16(__n128, __n64, __n64, const int); __n128 neon_sqdmlsl_i32(__n128, __n64, __n64, const int); __n128 neon_sqdmlsl2_i16(__n128, __n128, __n64, const int); __n128 neon_sqdmlsl2_i32(__n128, __n128, __n64, const int); __n128 neon_sqdmlsl_qi16(__n128, __n64, __n128, const int); __n128 neon_sqdmlsl_qi32(__n128, __n64, __n128, const int); __n128 neon_sqdmlsl2_qi16(__n128, __n128, __n128, const int); __n128 neon_sqdmlsl2_qi32(__n128, __n128, __n128, const int); #define vqdmlalh_s16(src1, src2, src3) neon_sqdmlalh_16(src1, src2, src3) #define vqdmlals_s32(src1, src2, src3) neon_sqdmlals_32(src1, src2, src3) #define vqdmlalh_lane_s16(src1, src2, src3, src4) neon_sqdmlalh_i16(src1, src2, src3, src4) #define vqdmlals_lane_s32(src1, src2, src3, src4) neon_sqdmlals_i32(src1, src2, src3, src4) #define vqdmlalh_laneq_s16(src1, src2, src3, src4) neon_sqdmlalh_qi16(src1, src2, src3, src4) #define vqdmlals_laneq_s32(src1, src2, src3, src4) neon_sqdmlals_qi32(src1, src2, src3, src4) #define vqdmlsl_s16(src1, src2, src3) neon_sqdmlsl_16(src1, src2, src3) #define vqdmlsl_s32(src1, src2, src3) neon_sqdmlsl_32(src1, src2, src3) #define vqdmlsl_high_s16(src1, src2, src3) neon_sqdmlsl2_16(src1, src2, src3) #define vqdmlsl_high_s32(src1, src2, src3) neon_sqdmlsl2_32(src1, src2, src3) #define vqdmlsl_lane_s16(src1, src2, src3, src4) neon_sqdmlsl_i16(src1, src2, src3, src4) #define vqdmlsl_lane_s32(src1, src2, src3, src4) neon_sqdmlsl_i32(src1, src2, src3, src4) #define vqdmlsl_high_lane_s16(src1, src2, src3, src4) neon_sqdmlsl2_i16(src1, src2, src3, src4) #define vqdmlsl_high_lane_s32(src1, src2, src3, src4) 
neon_sqdmlsl2_i32(src1, src2, src3, src4) #define vqdmlsl_laneq_s16(src1, src2, src3, src4) neon_sqdmlsl_qi16(src1, src2, src3, src4) #define vqdmlsl_laneq_s32(src1, src2, src3, src4) neon_sqdmlsl_qi32(src1, src2, src3, src4) #define vqdmlsl_high_laneq_s16(src1, src2, src3, src4) neon_sqdmlsl2_qi16(src1, src2, src3, src4) #define vqdmlsl_high_laneq_s32(src1, src2, src3, src4) neon_sqdmlsl2_qi32(src1, src2, src3, src4) #define vqdmlsl_n_s16(src1, src2, src3) vqdmlsl_lane_s16(src1, src2, vmov_n_s16(src3), 0) #define vqdmlsl_n_s32(src1, src2, src3) vqdmlsl_lane_s32(src1, src2, vmov_n_s32(src3), 0) #define vqdmlsl_high_n_s16(src1, src2, src3) vqdmlsl_high_lane_s16(src1, src2, vmov_n_s16(src3), 0) #define vqdmlsl_high_n_s32(src1, src2, src3) vqdmlsl_high_lane_s32(src1, src2, vmov_n_s32(src3), 0) float neon_sqdmlslh_16(float, __n16, __n16); __n64 neon_sqdmlsls_32(__n64, float, float); float neon_sqdmlslh_i16(float, __n16, __n64, const int); __n64 neon_sqdmlsls_i32(__n64, float, __n64, const int); float neon_sqdmlslh_qi16(float, __n16, __n128, const int); __n64 neon_sqdmlsls_qi32(__n64, float, __n128, const int); #define vqdmlslh_s16(src1, src2, src3) neon_sqdmlslh_16(src1, src2, src3) #define vqdmlsls_s32(src1, src2, src3) neon_sqdmlsls_32(src1, src2, src3) #define vqdmlslh_lane_s16(src1, src2, src3, src) neon_sqdmlslh_i16(src1, src2, src3, src) #define vqdmlsls_lane_s32(src1, src2, src3, src) neon_sqdmlsls_i32(src1, src2, src3, src) #define vqdmlslh_laneq_s16(src1, src2, src3, src) neon_sqdmlslh_qi16(src1, src2, src3, src) #define vqdmlsls_laneq_s32(src1, src2, src3, src) neon_sqdmlsls_qi32(src1, src2, src3, src) // CMEQ/CMGE/CMGT/CMHI/CMHS/CMLE/CMLT/CMTST/FACGE/FACGT/FCMEQ/FCMGE/FCMGT/FCMLE/FCMLT/ __n64 neon_facge32(__n64, __n64); __n128 neon_facgeq32(__n128, __n128); __n128 neon_facgeq64(__n128, __n128); float neon_facges32(float, float); double neon_facges64(double, double); __n64 neon_facgt32(__n64, __n64); __n128 neon_facgtq32(__n128, __n128); __n128 
neon_facgtq64(__n128, __n128); float neon_facgts32(float, float); double neon_facgts64(double, double); __n64 neon_fcmeq32(__n64, __n64); __n128 neon_fcmeqq32(__n128, __n128); __n128 neon_fcmeqq64(__n128, __n128); __n64 neon_fcmeqz32(__n64); __n128 neon_fcmeqzq32(__n128); __n128 neon_fcmeqzq64(__n128); float neon_fcmeqs32(float, float); double neon_fcmeqs64(double, double); float neon_fcmeqzs32(float); double neon_fcmeqzs64(double); __n64 neon_fcmge32(__n64, __n64); __n128 neon_fcmgeq32(__n128, __n128); __n128 neon_fcmgeq64(__n128, __n128); __n64 neon_fcmgez32(__n64); __n128 neon_fcmgezq32(__n128); __n128 neon_fcmgezq64(__n128); float neon_fcmges32(float, float); double neon_fcmges64(double, double); float neon_fcmgezs32(float); double neon_fcmgezs64(double); __n64 neon_fcmgt32(__n64, __n64); __n128 neon_fcmgtq32(__n128, __n128); __n128 neon_fcmgtq64(__n128, __n128); __n64 neon_fcmgtz32(__n64); __n128 neon_fcmgtzq32(__n128); __n128 neon_fcmgtzq64(__n128); float neon_fcmgts32(float, float); double neon_fcmgts64(double, double); float neon_fcmgtzs32(float); double neon_fcmgtzs64(double); __n64 neon_fcmlez32(__n64); __n128 neon_fcmlezq32(__n128); __n128 neon_fcmlezq64(__n128); float neon_fcmlezs32(float); double neon_fcmlezs64(double); __n64 neon_fcmltz32(__n64); __n128 neon_fcmltzq32(__n128); __n128 neon_fcmltzq64(__n128); float neon_fcmltzs32(float); double neon_fcmltzs64(double); __n64 neon_cmeq8(__n64, __n64); __n128 neon_cmeqq8(__n128, __n128); __n64 neon_cmeq16(__n64, __n64); __n128 neon_cmeqq16(__n128, __n128); __n64 neon_cmeq32(__n64, __n64); __n128 neon_cmeqq32(__n128, __n128); __n128 neon_cmeqq64(__n128, __n128); __n64 neon_cmeqz8(__n64); __n128 neon_cmeqzq8(__n128); __n64 neon_cmeqz16(__n64); __n128 neon_cmeqzq16(__n128); __n64 neon_cmeqz32(__n64); __n128 neon_cmeqzq32(__n128); __n128 neon_cmeqzq64(__n128); double neon_cmeqs64(double, double); double neon_cmeqzs64(double); __n64 neon_cmge8(__n64, __n64); __n128 neon_cmgeq8(__n128, __n128); __n64 
neon_cmge16(__n64, __n64); __n128 neon_cmgeq16(__n128, __n128); __n64 neon_cmge32(__n64, __n64); __n128 neon_cmgeq32(__n128, __n128); __n128 neon_cmgeq64(__n128, __n128); __n64 neon_cmgez8(__n64); __n128 neon_cmgezq8(__n128); __n64 neon_cmgez16(__n64); __n128 neon_cmgezq16(__n128); __n64 neon_cmgez32(__n64); __n128 neon_cmgezq32(__n128); __n128 neon_cmgezq64(__n128); double neon_cmges64(double, double); double neon_cmgezs64(double); __n64 neon_cmgt8(__n64, __n64); __n128 neon_cmgtq8(__n128, __n128); __n64 neon_cmgt16(__n64, __n64); __n128 neon_cmgtq16(__n128, __n128); __n64 neon_cmgt32(__n64, __n64); __n128 neon_cmgtq32(__n128, __n128); __n128 neon_cmgtq64(__n128, __n128); __n64 neon_cmgtz8(__n64); __n128 neon_cmgtzq8(__n128); __n64 neon_cmgtz16(__n64); __n128 neon_cmgtzq16(__n128); __n64 neon_cmgtz32(__n64); __n128 neon_cmgtzq32(__n128); __n128 neon_cmgtzq64(__n128); double neon_cmgts64(double, double); double neon_cmgtzs64(double); __n64 neon_cmhi8(__n64, __n64); __n128 neon_cmhiq8(__n128, __n128); __n64 neon_cmhi16(__n64, __n64); __n128 neon_cmhiq16(__n128, __n128); __n64 neon_cmhi32(__n64, __n64); __n128 neon_cmhiq32(__n128, __n128); __n128 neon_cmhiq64(__n128, __n128); double neon_cmhis64(double, double); __n64 neon_cmhs8(__n64, __n64); __n128 neon_cmhsq8(__n128, __n128); __n64 neon_cmhs16(__n64, __n64); __n128 neon_cmhsq16(__n128, __n128); __n64 neon_cmhs32(__n64, __n64); __n128 neon_cmhsq32(__n128, __n128); __n128 neon_cmhsq64(__n128, __n128); double neon_cmhss64(double, double); __n64 neon_cmlez8(__n64); __n128 neon_cmlezq8(__n128); __n64 neon_cmlez16(__n64); __n128 neon_cmlezq16(__n128); __n64 neon_cmlez32(__n64); __n128 neon_cmlezq32(__n128); __n128 neon_cmlezq64(__n128); double neon_cmlezs64(double); __n64 neon_cmltz8(__n64); __n128 neon_cmltzq8(__n128); __n64 neon_cmltz16(__n64); __n128 neon_cmltzq16(__n128); __n64 neon_cmltz32(__n64); __n128 neon_cmltzq32(__n128); __n128 neon_cmltzq64(__n128); double neon_cmltzs64(double); __n64 neon_cmtst8(__n64, 
__n64); __n128 neon_cmtstq8(__n128, __n128); __n64 neon_cmtst16(__n64, __n64); __n128 neon_cmtstq16(__n128, __n128); __n64 neon_cmtst32(__n64, __n64); __n128 neon_cmtstq32(__n128, __n128); __n128 neon_cmtstq64(__n128, __n128); double neon_cmtsts64(double, double); #define vceq_z_f32_ex(src) neon_fcmeqz32(src) #define vceq_z_s16_ex(src) neon_cmeqz16(src) #define vceq_z_s32_ex(src) neon_cmeqz32(src) #define vceq_z_s8_ex(src) neon_cmeqz8(src) #define vceq_z_u16_ex(src) neon_cmeqz16(src) #define vceq_z_u32_ex(src) neon_cmeqz32(src) #define vceq_z_u8_ex(src) neon_cmeqz8(src) #define vceqq_z_f32_ex(src) neon_fcmeqzq32(src) #define vceqq_z_s16_ex(src) neon_cmeqzq16(src) #define vceqq_z_s32_ex(src) neon_cmeqzq32(src) #define vceqq_z_s8_ex(src) neon_cmeqzq8(src) #define vceqq_z_u16_ex(src) neon_cmeqzq16(src) #define vceqq_z_u32_ex(src) neon_cmeqzq32(src) #define vceqq_z_u8_ex(src) neon_cmeqzq8(src) #define vceq_f32(src1, src2) neon_fcmeq32(src1, src2) #define vceqz_u8(src) neon_cmeqz8(src) #define vceq_p8(src1, src2) neon_cmeq8(src1, src2) #define vceq_s16(src1, src2) neon_cmeq16(src1, src2) #define vceq_s32(src1, src2) neon_cmeq32(src1, src2) #define vceq_s8(src1, src2) neon_cmeq8(src1, src2) #define vceq_u16(src1, src2) neon_cmeq16(src1, src2) #define vceq_u32(src1, src2) neon_cmeq32(src1, src2) #define vceq_u8(src1, src2) neon_cmeq8(src1, src2) #define vceqq_f32(src1, src2) neon_fcmeqq32(src1, src2) #define vceqq_p8(src1, src2) neon_cmeqq8(src1, src2) #define vceqq_s16(src1, src2) neon_cmeqq16(src1, src2) #define vceqq_s32(src1, src2) neon_cmeqq32(src1, src2) #define vceqq_s8(src1, src2) neon_cmeqq8(src1, src2) #define vceqq_u16(src1, src2) neon_cmeqq16(src1, src2) #define vceqq_u32(src1, src2) neon_cmeqq32(src1, src2) #define vceqq_u8(src1, src2) neon_cmeqq8(src1, src2) #define vcge_z_f32_ex(src) neon_fcmgez32(src) #define vcge_z_s16_ex(src) neon_cmgez16(src) #define vcge_z_s32_ex(src) neon_cmgez32(src) #define vcge_z_s8_ex(src) neon_cmgez8(src) #define 
vcge_z_u16_ex(src) neon_cmgez16(src)
#define vcge_z_u32_ex(src) neon_cmgez32(src)
#define vcge_z_u8_ex(src) neon_cmgez8(src)

// Quad-register "greater than or equal to zero" forms.
// NOTE(review): the *_z_u*_ex macros in this group expand to the same
// compare-against-zero intrinsics as the signed ones - confirm that is the
// intended behaviour for unsigned lanes.
#define vcgeq_z_f32_ex(src) neon_fcmgezq32(src)
#define vcgeq_z_s16_ex(src) neon_cmgezq16(src)
#define vcgeq_z_s32_ex(src) neon_cmgezq32(src)
#define vcgeq_z_s8_ex(src) neon_cmgezq8(src)
#define vcgeq_z_u16_ex(src) neon_cmgezq16(src)
#define vcgeq_z_u32_ex(src) neon_cmgezq32(src)
#define vcgeq_z_u8_ex(src) neon_cmgezq8(src)

// vcge register form: src1 >= src2, per lane.
// Signed lanes use CMGE; unsigned lanes must use CMHS (unsigned higher or
// same). Routing the unsigned forms through CMGE would treat lanes with the
// top bit set as negative and produce the wrong mask.
// NOTE(review): the p8 forms are left on the signed compare as found, although
// poly8_t is an unsigned type in this header - confirm intended semantics.
#define vcge_f32(src1, src2) neon_fcmge32(src1, src2)
#define vcge_p8(src1, src2) neon_cmge8(src1, src2)
#define vcge_s16(src1, src2) neon_cmge16(src1, src2)
#define vcge_s32(src1, src2) neon_cmge32(src1, src2)
#define vcge_s8(src1, src2) neon_cmge8(src1, src2)
#define vcge_u16(src1, src2) neon_cmhs16(src1, src2)
#define vcge_u32(src1, src2) neon_cmhs32(src1, src2)
#define vcge_u8(src1, src2) neon_cmhs8(src1, src2)
#define vcgeq_f32(src1, src2) neon_fcmgeq32(src1, src2)
#define vcgeq_p8(src1, src2) neon_cmgeq8(src1, src2)
#define vcgeq_s16(src1, src2) neon_cmgeq16(src1, src2)
#define vcgeq_s32(src1, src2) neon_cmgeq32(src1, src2)
#define vcgeq_s8(src1, src2) neon_cmgeq8(src1, src2)
#define vcgeq_u16(src1, src2) neon_cmhsq16(src1, src2)
#define vcgeq_u32(src1, src2) neon_cmhsq32(src1, src2)
#define vcgeq_u8(src1, src2) neon_cmhsq8(src1, src2)

// "Less than or equal to zero" forms (floating-point and signed only).
#define vcle_z_f32_ex(src) neon_fcmlez32(src)
#define vcle_z_s16_ex(src) neon_cmlez16(src)
#define vcle_z_s32_ex(src) neon_cmlez32(src)
#define vcle_z_s8_ex(src) neon_cmlez8(src)
#define vcleq_z_f32_ex(src) neon_fcmlezq32(src)
#define vcleq_z_s16_ex(src) neon_cmlezq16(src)
#define vcleq_z_s32_ex(src) neon_cmlezq32(src)
#define vcleq_z_s8_ex(src) neon_cmlezq8(src)

// vcle register form is alias with vcge with reversed operands
#define vcle_f32(src1, src2) neon_fcmge32(src2, src1)
#define vcle_p8(src1, src2) neon_cmge8(src2, src1)
#define vcle_s16(src1, src2) neon_cmge16(src2, src1)
#define vcle_s32(src1, src2) neon_cmge32(src2, src1)
#define vcle_s8(src1, src2) neon_cmge8(src2, src1)
#define vcle_u16(src1, src2) neon_cmge16(src2, src1) #define vcle_u32(src1, src2) neon_cmge32(src2, src1) #define vcle_u8(src1, src2) neon_cmge8(src2, src1) #define vcleq_f32(src1, src2) neon_fcmgeq32(src2, src1) #define vcleq_p8(src1, src2) neon_cmgeq8(src2, src1) #define vcleq_s16(src1, src2) neon_cmgeq16(src2, src1) #define vcleq_s32(src1, src2) neon_cmgeq32(src2, src1) #define vcleq_s8(src1, src2) neon_cmgeq8(src2, src1) #define vcleq_u16(src1, src2) neon_cmgeq16(src2, src1) #define vcleq_u32(src1, src2) neon_cmgeq32(src2, src1) #define vcleq_u8(src1, src2) neon_cmgeq8(src2, src1) #define vcgt_z_f32_ex(src) neon_fcmgtz32(src) #define vcgt_z_s16_ex(src) neon_cmgtz16(src) #define vcgt_z_s32_ex(src) neon_cmgtz32(src) #define vcgt_z_s8_ex(src) neon_cmgtz8(src) #define vcgt_z_u16_ex(src) neon_cmgtz16(src) #define vcgt_z_u32_ex(src) neon_cmgtz32(src) #define vcgt_z_u8_ex(src) neon_cmgtz8(src) #define vcgtq_z_f32_ex(src) neon_fcmgtzq32(src) #define vcgtq_z_s16_ex(src) neon_cmgtzq16(src) #define vcgtq_z_s32_ex(src) neon_cmgtzq32(src) #define vcgtq_z_s8_ex(src) neon_cmgtzq8(src) #define vcgtq_z_u16_ex(src) neon_cmgtzq16(src) #define vcgtq_z_u32_ex(src) neon_cmgtzq32(src) #define vcgtq_z_u8_ex(src) neon_cmgtzq8(src) #define vcgt_f32(src1, src2) neon_fcmgt32(src1, src2) #define vcgt_p8(src1, src2) neon_cmgt8(src1, src2) #define vcgt_s16(src1, src2) neon_cmgt16(src1, src2) #define vcgt_s32(src1, src2) neon_cmgt32(src1, src2) #define vcgt_s8(src1, src2) neon_cmgt8(src1, src2) #define vcgt_u16(src1, src2) neon_cmgt16(src1, src2) #define vcgt_u32(src1, src2) neon_cmgt32(src1, src2) #define vcgt_u8(src1, src2) neon_cmgt8(src1, src2) #define vcgtq_f32(src1, src2) neon_fcmgtq32(src1, src2) #define vcgtq_p8(src1, src2) neon_cmgtq8(src1, src2) #define vcgtq_s16(src1, src2) neon_cmgtq16(src1, src2) #define vcgtq_s32(src1, src2) neon_cmgtq32(src1, src2) #define vcgtq_s8(src1, src2) neon_cmgtq8(src1, src2) #define vcgtq_u16(src1, src2) neon_cmgtq16(src1, src2) #define vcgtq_u32(src1, 
src2) neon_cmgtq32(src1, src2) #define vcgtq_u8(src1, src2) neon_cmgtq8(src1, src2) #define vclt_z_f32_ex(src) neon_fcmltz32(src) #define vclt_z_s16_ex(src) neon_cmltz16(src) #define vclt_z_s32_ex(src) neon_cmltz32(src) #define vclt_z_s8_ex(src) neon_cmltz8(src) #define vcltq_z_f32_ex(src) neon_fcmltzq32(src) #define vcltq_z_s16_ex(src) neon_cmltzq16(src) #define vcltq_z_s32_ex(src) neon_cmltzq32(src) #define vcltq_z_s8_ex(src) neon_cmltzq8(src) // vclt register form is alias with vcgt with reversed operands #define vclt_f32(src1, src2) neon_fcmgt32(src2, src1) #define vclt_p8(src1, src2) neon_cmgt8(src2, src1) #define vclt_s16(src1, src2) neon_cmgt16(src2, src1) #define vclt_s32(src1, src2) neon_cmgt32(src2, src1) #define vclt_s8(src1, src2) neon_cmgt8(src2, src1) #define vclt_u16(src1, src2) neon_cmgt16(src2, src1) #define vclt_u32(src1, src2) neon_cmgt32(src2, src1) #define vclt_u8(src1, src2) neon_cmgt8(src2, src1) #define vcltq_f32(src1, src2) neon_fcmgtq32(src2, src1) #define vcltq_p8(src1, src2) neon_cmgtq8(src2, src1) #define vcltq_s16(src1, src2) neon_cmgtq16(src2, src1) #define vcltq_s32(src1, src2) neon_cmgtq32(src2, src1) #define vcltq_s8(src1, src2) neon_cmgtq8(src2, src1) #define vcltq_u16(src1, src2) neon_cmgtq16(src2, src1) #define vcltq_u32(src1, src2) neon_cmgtq32(src2, src1) #define vcltq_u8(src1, src2) neon_cmgtq8(src2, src1) #define vacge_f32(src1, src2) neon_facge32(src1, src2) #define vacgt_f32(src1, src2) neon_facgt32(src1, src2) // vacle register form is alias with vacge with operands reversed #define vacle_f32(src1, src2) neon_facge32(src2, src1) #define vaclt_f32(src1, src2) neon_facgt32(src2, src1) #define vacgeq_f32(src1, src2) neon_facgeq32(src1, src2) #define vacgtq_f32(src1, src2) neon_facgtq32(src1, src2) #define vacleq_f32(src1, src2) neon_facgeq32(src2, src1) #define vacltq_f32(src1, src2) neon_facgtq32(src2, src1) // FCVTAS/FCVTAU/FCVTMS/FCVTMU/FCVTNS/FCVTPS/FCVTPU/FCVTZS/FCVTZU/SCVTF/UCVTF __n64 neon_fcvtas32(__n64); __n128 
neon_fcvtasq32(__n128); __n128 neon_fcvtasq64(__n128); float neon_fcvtass32(float); double neon_fcvtass64(double); __n64 neon_fcvtau32(__n64); __n128 neon_fcvtauq32(__n128); __n128 neon_fcvtauq64(__n128); float neon_fcvtaus32(float); double neon_fcvtaus64(double); __n64 neon_fcvtms32(__n64); __n128 neon_fcvtmsq32(__n128); __n128 neon_fcvtmsq64(__n128); float neon_fcvtmss32(float); double neon_fcvtmss64(double); __n64 neon_fcvtmu32(__n64); __n128 neon_fcvtmuq32(__n128); __n128 neon_fcvtmuq64(__n128); float neon_fcvtmus32(float); double neon_fcvtmus64(double); __n64 neon_fcvtns32(__n64); __n128 neon_fcvtnsq32(__n128); __n128 neon_fcvtnsq64(__n128); float neon_fcvtnss32(float); double neon_fcvtnss64(double); __n64 neon_fcvtnu32(__n64); __n128 neon_fcvtnuq32(__n128); __n128 neon_fcvtnuq64(__n128); float neon_fcvtnus32(float); double neon_fcvtnus64(double); __n64 neon_fcvtps32(__n64); __n128 neon_fcvtpsq32(__n128); __n128 neon_fcvtpsq64(__n128); float neon_fcvtpss32(float); double neon_fcvtpss64(double); __n64 neon_fcvtpu32(__n64); __n128 neon_fcvtpuq32(__n128); __n128 neon_fcvtpuq64(__n128); float neon_fcvtpus32(float); double neon_fcvtpus64(double); __n64 neon_fcvtzs32(__n64); __n128 neon_fcvtzsq32(__n128); __n128 neon_fcvtzsq64(__n128); float neon_fcvtzss32(float); double neon_fcvtzss64(double); __n64 neon_fcvtzu32(__n64); __n128 neon_fcvtzuq32(__n128); __n128 neon_fcvtzuq64(__n128); float neon_fcvtzus32(float); double neon_fcvtzus64(double); __n64 neon_scvtf32(__n64); __n128 neon_scvtfq32(__n128); __n128 neon_scvtfq64(__n128); float neon_scvtfs32(float); double neon_scvtfs64(double); __n64 neon_ucvtf32(__n64); __n128 neon_ucvtfq32(__n128); __n128 neon_ucvtfq64(__n128); float neon_ucvtfs32(float); double neon_ucvtfs64(double); __n64 neon_fcvtzsfp32(__n64, const int); __n128 neon_fcvtzsfpq32(__n128, const int); __n128 neon_fcvtzsfpq64(__n128, const int); float neon_fcvtzsfps32(float, const int); double neon_fcvtzsfps64(double, const int); __n64 neon_fcvtzufp32(__n64, 
const int); __n128 neon_fcvtzufpq32(__n128, const int); __n128 neon_fcvtzufpq64(__n128, const int); float neon_fcvtzufps32(float, const int); double neon_fcvtzufps64(double, const int); __n64 neon_scvtffp32(__n64, const int); __n128 neon_scvtffpq32(__n128, const int); __n128 neon_scvtffpq64(__n128, const int); float neon_scvtffps32(float, const int); double neon_scvtffps64(double, const int); __n64 neon_ucvtffp32(__n64, const int); __n128 neon_ucvtffpq32(__n128, const int); __n128 neon_ucvtffpq64(__n128, const int); float neon_ucvtffps32(float, const int); double neon_ucvtffps64(double, const int); #define vcvt_n_f32_s32(src1, src2) neon_scvtffp32(src1, src2) #define vcvt_n_f32_u32(src1, src2) neon_ucvtffp32(src1, src2) #define vcvt_n_s32_f32(src1, src2) neon_fcvtzsfp32(src1, src2) #define vcvt_n_u32_f32(src1, src2) neon_fcvtzufp32(src1, src2) #define vcvtq_n_f32_s32(src1, src2) neon_scvtffpq32(src1, src2) #define vcvtq_n_f32_u32(src1, src2) neon_ucvtffpq32(src1, src2) #define vcvtq_n_s32_f32(src1, src2) neon_fcvtzsfpq32(src1, src2) #define vcvtq_n_u32_f32(src1, src2) neon_fcvtzufpq32(src1, src2) #define vcvta_s32_f32(src) neon_fcvtas32(src) #define vcvta_u32_f32(src) neon_fcvtau32(src) #define vcvtm_s32_f32(src) neon_fcvtms32(src) #define vcvtm_u32_f32(src) neon_fcvtmu32(src) #define vcvtn_s32_f32(src) neon_fcvtns32(src) #define vcvtn_u32_f32(src) neon_fcvtnu32(src) #define vcvtp_s32_f32(src) neon_fcvtps32(src) #define vcvtp_u32_f32(src) neon_fcvtpu32(src) #define vcvtaq_s32_f32(src) neon_fcvtasq32(src) #define vcvtaq_u32_f32(src) neon_fcvtauq32(src) #define vcvtmq_s32_f32(src) neon_fcvtmsq32(src) #define vcvtmq_u32_f32(src) neon_fcvtmuq32(src) #define vcvtnq_s32_f32(src) neon_fcvtnsq32(src) #define vcvtnq_u32_f32(src) neon_fcvtnuq32(src) #define vcvtpq_s32_f32(src) neon_fcvtpsq32(src) #define vcvtpq_u32_f32(src) neon_fcvtpuq32(src) #define vcvt_f32_s32(src) neon_scvtf32(src) #define vcvt_f32_u32(src) neon_ucvtf32(src) #define vcvt_s32_f32(src) neon_fcvtzs32(src) 
// Remaining plain vcvt aliases (FCVTZU / SCVTF / UCVTF / FCVTZS).
#define vcvt_u32_f32(src) neon_fcvtzu32(src)
#define vcvtq_f32_s32(src) neon_scvtfq32(src)
#define vcvtq_f32_u32(src) neon_ucvtfq32(src)
#define vcvtq_s32_f32(src) neon_fcvtzsq32(src)
#define vcvtq_u32_f32(src) neon_fcvtzuq32(src)

// FRECPE/FRECPS/FRECPX/URECPE
__n64 neon_frecpe32(__n64);
__n128 neon_frecpeq32(__n128);
__n128 neon_frecpeq64(__n128);
float neon_frecpes32(float);
double neon_frecpes64(double);
__n64 neon_frecps32(__n64, __n64);
__n128 neon_frecpsq32(__n128, __n128);
__n128 neon_frecpsq64(__n128, __n128);
float neon_frecpss32(float, float);
double neon_frecpss64(double, double);
__n64 neon_urecpe32(__n64);
__n128 neon_urecpeq32(__n128);
float neon_frecpx32(float);
double neon_frecpx64(double);

// vrecpe / vrecps aliases onto the intrinsics declared above.
#define vrecpe_f32(src) neon_frecpe32(src)
#define vrecpe_u32(src) neon_urecpe32(src)
#define vrecpeq_f32(src) neon_frecpeq32(src)
#define vrecpeq_u32(src) neon_urecpeq32(src)
#define vrecps_f32(src1, src2) neon_frecps32(src1, src2)
#define vrecpsq_f32(src1, src2) neon_frecpsq32(src1, src2)

// ZIP1/ZIP2/UZP1/UZP2/TRN1/TRN2
// Single-result forms: one __n64 or __n128 is returned.
__n64 neon_zip1_8(__n64 _Dd, __n64 _Dm);
__n128 neon_zip1_q8(__n128 _Qd, __n128 _Qm);
__n64 neon_zip1_16(__n64 _Dd, __n64 _Dm);
__n128 neon_zip1_q16(__n128 _Qd, __n128 _Qm);
__n64 neon_zip1_32(__n64 _Dd, __n64 _Dm);
__n128 neon_zip1_q32(__n128 _Qd, __n128 _Qm);
__n128 neon_zip1_q64(__n128 _Qd, __n128 _Qm);

__n64 neon_zip2_8(__n64 _Dd, __n64 _Dm);
__n128 neon_zip2_q8(__n128 _Qd, __n128 _Qm);
__n64 neon_zip2_16(__n64 _Dd, __n64 _Dm);
__n128 neon_zip2_q16(__n128 _Qd, __n128 _Qm);
__n64 neon_zip2_32(__n64 _Dd, __n64 _Dm);
__n128 neon_zip2_q32(__n128 _Qd, __n128 _Qm);
__n128 neon_zip2_q64(__n128 _Qd, __n128 _Qm);

__n64 neon_uzp1_8(__n64 _Dd, __n64 _Dm);
__n128 neon_uzp1_q8(__n128 _Qd, __n128 _Qm);
__n64 neon_uzp1_16(__n64 _Dd, __n64 _Dm);
__n128 neon_uzp1_q16(__n128 _Qd, __n128 _Qm);
__n64 neon_uzp1_32(__n64 _Dd, __n64 _Dm);
__n128 neon_uzp1_q32(__n128 _Qd, __n128 _Qm);
__n128 neon_uzp1_q64(__n128 _Qd, __n128 _Qm);

__n64 neon_uzp2_8(__n64 _Dd, __n64 _Dm);
__n128 neon_uzp2_q8(__n128 _Qd, __n128 _Qm);
__n64 neon_uzp2_16(__n64 _Dd, __n64 _Dm);
__n128 neon_uzp2_q16(__n128 _Qd, __n128 _Qm);
__n64 neon_uzp2_32(__n64 _Dd, __n64 _Dm);
__n128 neon_uzp2_q32(__n128 _Qd, __n128 _Qm);
__n128 neon_uzp2_q64(__n128 _Qd, __n128 _Qm);

__n64 neon_trn1_8(__n64 _Dd, __n64 _Dm);
__n128 neon_trn1_q8(__n128 _Qd, __n128 _Qm);
__n64 neon_trn1_16(__n64 _Dd, __n64 _Dm);
__n128 neon_trn1_q16(__n128 _Qd, __n128 _Qm);
__n64 neon_trn1_32(__n64 _Dd, __n64 _Dm);
__n128 neon_trn1_q32(__n128 _Qd, __n128 _Qm);
__n128 neon_trn1_q64(__n128 _Qd, __n128 _Qm);

__n64 neon_trn2_8(__n64 _Dd, __n64 _Dm);
__n128 neon_trn2_q8(__n128 _Qd, __n128 _Qm);
__n64 neon_trn2_16(__n64 _Dd, __n64 _Dm);
__n128 neon_trn2_q16(__n128 _Qd, __n128 _Qm);
__n64 neon_trn2_32(__n64 _Dd, __n64 _Dm);
__n128 neon_trn2_q32(__n128 _Qd, __n128 _Qm);
__n128 neon_trn2_q64(__n128 _Qd, __n128 _Qm);

// Paired-result forms: a two-element __n64x2 / __n128x2 struct is returned.
__n64x2 neon_zip_8(__n64 _Dd, __n64 _Dm);
__n128x2 neon_zip_q8(__n128 _Qd, __n128 _Qm);
__n64x2 neon_zip_16(__n64 _Dd, __n64 _Dm);
__n128x2 neon_zip_q16(__n128 _Qd, __n128 _Qm);
__n64x2 neon_zip_32(__n64 _Dd, __n64 _Dm);
__n128x2 neon_zip_q32(__n128 _Qd, __n128 _Qm);
__n128x2 neon_zip_q64(__n128 _Qd, __n128 _Qm);

__n64x2 neon_uzp_8(__n64 _Dd, __n64 _Dm);
__n128x2 neon_uzp_q8(__n128 _Qd, __n128 _Qm);
__n64x2 neon_uzp_16(__n64 _Dd, __n64 _Dm);
__n128x2 neon_uzp_q16(__n128 _Qd, __n128 _Qm);
__n64x2 neon_uzp_32(__n64 _Dd, __n64 _Dm);
__n128x2 neon_uzp_q32(__n128 _Qd, __n128 _Qm);
__n128x2 neon_uzp_q64(__n128 _Qd, __n128 _Qm);

__n64x2 neon_trn_8(__n64 _Dd, __n64 _Dm);
__n128x2 neon_trn_q8(__n128 _Qd, __n128 _Qm);
__n64x2 neon_trn_16(__n64 _Dd, __n64 _Dm);
__n128x2 neon_trn_q16(__n128 _Qd, __n128 _Qm);
__n64x2 neon_trn_32(__n64 _Dd, __n64 _Dm);
__n128x2 neon_trn_q32(__n128 _Qd, __n128 _Qm);
__n128x2 neon_trn_q64(__n128 _Qd, __n128 _Qm);

// vzip aliases: element signedness does not matter, only the lane width.
#define vzip_p16(src1, src2) neon_zip_16(src1, src2)
#define vzip_p8(src1, src2) neon_zip_8(src1, src2)
#define vzip_s16(src1, src2) neon_zip_16(src1, src2)
#define vzip_s8(src1, src2) neon_zip_8(src1, src2) #define vzip_u16(src1, src2) neon_zip_16(src1, src2) #define vzip_u8(src1, src2) neon_zip_8(src1, src2) #define vzip_f32(src1, src2) neon_zip_32(src1, src2) #define vzip_s32(src1, src2) neon_zip_32(src1, src2) #define vzip_u32(src1, src2) neon_zip_32(src1, src2) #define vzipq_f32(src1, src2) neon_zip_q32(src1, src2) #define vzipq_p16(src1, src2) neon_zip_q16(src1, src2) #define vzipq_p8(src1, src2) neon_zip_q8(src1, src2) #define vzipq_s16(src1, src2) neon_zip_q16(src1, src2) #define vzipq_s32(src1, src2) neon_zip_q32(src1, src2) #define vzipq_s8(src1, src2) neon_zip_q8(src1, src2) #define vzipq_u16(src1, src2) neon_zip_q16(src1, src2) #define vzipq_u32(src1, src2) neon_zip_q32(src1, src2) #define vzipq_u8(src1, src2) neon_zip_q8(src1, src2) #define vzip1_u32(src1, src2) neon_zip1_32(src1, src2) #define vzip1q_u8(src1, src2) neon_zip1_q8(src1, src2) #define vzip1q_u16(src1, src2) neon_zip1_q16(src1, src2) #define vuzp_p16(src1, src2) neon_uzp_16(src1, src2) #define vuzp_p8(src1, src2) neon_uzp_8(src1, src2) #define vuzp_s16(src1, src2) neon_uzp_16(src1, src2) #define vuzp_s8(src1, src2) neon_uzp_8(src1, src2) #define vuzp_u16(src1, src2) neon_uzp_16(src1, src2) #define vuzp_u8(src1, src2) neon_uzp_8(src1, src2) #define vuzp_f32(src1, src2) neon_uzp_32(src1, src2) #define vuzp_s32(src1, src2) neon_uzp_32(src1, src2) #define vuzp_u32(src1, src2) neon_uzp_32(src1, src2) #define vuzpq_f32(src1, src2) neon_uzp_q32(src1, src2) #define vuzpq_p16(src1, src2) neon_uzp_q16(src1, src2) #define vuzp1q_u16(src1, src2) neon_uzp1_q16(src1, src2) #define vuzp1q_u8(src1, src2) neon_uzp1_q8(src1, src2) #define vuzpq_p8(src1, src2) neon_uzp_q8(src1, src2) #define vuzpq_s16(src1, src2) neon_uzp_q16(src1, src2) #define vuzpq_s32(src1, src2) neon_uzp_q32(src1, src2) #define vuzpq_s8(src1, src2) neon_uzp_q8(src1, src2) #define vuzpq_u16(src1, src2) neon_uzp_q16(src1, src2) #define vuzpq_u32(src1, src2) neon_uzp_q32(src1, src2) #define 
vuzpq_u8(src1, src2) neon_uzp_q8(src1, src2) #define vuzp2_u32(src1, src2) neon_uzp2_32(src1, src2) #define vuzp2q_u16(src1, src2) neon_uzp2_q16(src1, src2) #define vtrn_p16(src1, src2) neon_trn_16(src1, src2) #define vtrn_p8(src1, src2) neon_trn_8(src1, src2) #define vtrn_s16(src1, src2) neon_trn_16(src1, src2) #define vtrn_s8(src1, src2) neon_trn_8(src1, src2) #define vtrn_u16(src1, src2) neon_trn_16(src1, src2) #define vtrn_u8(src1, src2) neon_trn_8(src1, src2) #define vtrn_f32(src1, src2) neon_trn_32(src1, src2) #define vtrn_s32(src1, src2) neon_trn_32(src1, src2) #define vtrn_u32(src1, src2) neon_trn_32(src1, src2) #define vtrnq_f32(src1, src2) neon_trn_q32(src1, src2) #define vtrnq_p16(src1, src2) neon_trn_q16(src1, src2) #define vtrnq_p8(src1, src2) neon_trn_q8(src1, src2) #define vtrnq_s16(src1, src2) neon_trn_q16(src1, src2) #define vtrnq_s32(src1, src2) neon_trn_q32(src1, src2) #define vtrnq_s8(src1, src2) neon_trn_q8(src1, src2) #define vtrnq_u16(src1, src2) neon_trn_q16(src1, src2) #define vtrnq_u32(src1, src2) neon_trn_q32(src1, src2) #define vtrnq_u8(src1, src2) neon_trn_q8(src1, src2) __n64 neon_frinta_32(__n64); __n128 neon_frinta_q32(__n128); __n128 neon_frinta_q64(__n128); __n64 neon_frinti_32(__n64); __n128 neon_frinti_q32(__n128); __n128 neon_frinti_q64(__n128); __n64 neon_frintm_32(__n64); __n128 neon_frintm_q32(__n128); __n128 neon_frintm_q64(__n128); __n64 neon_frintn_32(__n64); __n128 neon_frintn_q32(__n128); __n128 neon_frintn_q64(__n128); __n64 neon_frintp_32(__n64); __n128 neon_frintp_q32(__n128); __n128 neon_frintp_q64(__n128); __n64 neon_frintx_32(__n64); __n128 neon_frintx_q32(__n128); __n128 neon_frintx_q64(__n128); __n64 neon_frintz_32(__n64); __n128 neon_frintz_q32(__n128); __n128 neon_frintz_q64(__n128); #define vrndi_f32(src) neon_frinti_32(src) #define vrnda_f32(src) neon_frinta_32(src) #define vrndm_f32(src) neon_frintm_32(src) #define vrndn_f32(src) neon_frintn_32(src) #define vrndp_f32(src) neon_frintp_32(src) #define 
vrndx_f32(src) neon_frintx_32(src) #define vrndiq_f32(src) neon_frinti_q32(src) #define vrndaq_f32(src) neon_frinta_q32(src) #define vrndmq_f32(src) neon_frintm_q32(src) #define vrndnq_f32(src) neon_frintn_q32(src) #define vrndpq_f32(src) neon_frintp_q32(src) #define vrndxq_f32(src) neon_frintx_q32(src) #define vrnd_f32(src) neon_frintz_32(src) #define vrndq_f32(src) neon_frintz_q32(src) // SHA1C/SHA1M/SHA1P/SHA256H2/SHA256H/SHA1SU0/SHA256SU1/SHA1SU1/SHA256SU0/SHA1H/ __n128 neon_sha1c(__n128, __n128, __n128); __n128 neon_sha1m(__n128, __n128, __n128); __n128 neon_sha1p(__n128, __n128, __n128); __n128 neon_sha256h2(__n128, __n128, __n128); __n128 neon_sha256h(__n128, __n128, __n128); __n128 neon_sha1su0(__n128, __n128, __n128); __n128 neon_sha256su1(__n128, __n128, __n128); __n128 neon_sha1su1(__n128, __n128); __n128 neon_sha256su0(__n128, __n128); __n128 neon_sha1h(__n128); #define sha1c_f32(src1, src2, src3) neon_sha1c(src1, src2, src3) #define sha1c_s32(src1, src2, src3) neon_sha1c(src1, src2, src3) #define sha1c_u32(src1, src2, src3) neon_sha1c(src1, src2, src3) #define vsha1cq_u32(src1, src2, src3) neon_sha1c(src1, src2, src3) #define sha1p_f32(src1, src2, src3) neon_sha1p(src1, src2, src3) #define sha1p_s32(src1, src2, src3) neon_sha1p(src1, src2, src3) #define sha1p_u32(src1, src2, src3) neon_sha1p(src1, src2, src3) #define vsha1pq_u32(src1, src2, src3) neon_sha1p(src1, src2, src3) #define sha1m_f32(src1, src2, src3) neon_sha1m(src1, src2, src3) #define sha1m_s32(src1, src2, src3) neon_sha1m(src1, src2, src3) #define sha1m_u32(src1, src2, src3) neon_sha1m(src1, src2, src3) #define vsha1mq_u32(src1, src2, src3) neon_sha1m(src1, src2, src3) #define sha1su1_f32(src1, src2) neon_sha1su1(src1, src2) #define sha1su1_s32(src1, src2) neon_sha1su1(src1, src2) #define sha1su1_u32(src1, src2) neon_sha1su1(src1, src2) #define vsha1su1q_u32(src1, src2) neon_sha1su1(src1, src2) #define sha256su0_f32(src1, src2) neon_sha256su0(src1, src2) #define sha256su0_s32(src1, src2) 
neon_sha256su0(src1, src2) #define sha256su0_u32(src1, src2) neon_sha256su0(src1, src2) #define vsha256su0q_u32(src1, src2) neon_sha256su0(src1, src2) #define sha1su0_f32(src1, src2, src3) neon_sha1su0(src1, src2, src3) #define sha1su0_s32(src1, src2, src3) neon_sha1su0(src1, src2, src3) #define sha1su0_u32(src1, src2, src3) neon_sha1su0(src1, src2, src3) #define vsha1su0q_u32(src1, src2, src3) neon_sha1su0(src1, src2, src3) #define sha256h_f32(src1, src2, src3) neon_sha256h(src1, src2, src3) #define sha256h_s32(src1, src2, src3) neon_sha256h(src1, src2, src3) #define sha256h_u32(src1, src2, src3) neon_sha256h(src1, src2, src3) #define sha256h2_f32(src1, src2, src3) neon_sha256h2(src1, src2, src3) #define sha256h2_s32(src1, src2, src3) neon_sha256h2(src1, src2, src3) #define sha256h2_u32(src1, src2, src3) neon_sha256h2(src1, src2, src3) #define vsha256hq_u32(src1, src2, src3) neon_sha256h(src1, src2, src3) #define vsha256h2q_u32(src1, src2, src3) neon_sha256h2(src1, src2, src3) #define sha256su1_f32(src1, src2, src3) neon_sha256su1(src1, src2, src3) #define sha256su1_s32(src1, src2, src3) neon_sha256su1(src1, src2, src3) #define sha256su1_u32(src1, src2, src3) neon_sha256su1(src1, src2, src3) #define vsha256su1q_u32(src1, src2, src3) neon_sha256su1(src1, src2, src3) #define sha1h_f32(src) neon_sha1h(src) #define sha1h_s32(src) neon_sha1h(src) #define sha1h_u32(src) neon_sha1h(src) #define vsha1h_u32(src) neon_sha1h(src) // SRI/SRSHR/SSHR/SSRA/USHR/URSRA/USRA/URSHR/SRSRA/SHL/SLI/SQSHLU/SQSHL/UQSHL/SQRSHL/URSHL/SRSHL/USHL/UQRSHL/SSHL __n64 neon_srii8 (__n64, __n64, const int); __n128 neon_sriiq8 (__n128, __n128, const int); __n64 neon_srii16 (__n64, __n64, const int); __n128 neon_sriiq16(__n128, __n128, const int); __n64 neon_srii32 (__n64, __n64, const int); __n128 neon_sriiq32(__n128, __n128, const int); __n128 neon_sriiq64(__n128, __n128, const int); __n64 neon_sriis64(__n64, __n64, const int); __n64 neon_srshri8 (__n64, const int); __n128 neon_srshriq8 (__n128, 
const int); __n64 neon_srshri16 (__n64, const int); __n128 neon_srshriq16(__n128, const int); __n64 neon_srshri32 (__n64, const int); __n128 neon_srshriq32(__n128, const int); __n128 neon_srshriq64(__n128, const int); __n64 neon_srshris64(__n64, const int); __n64 neon_sshri8 (__n64, const int); __n128 neon_sshriq8 (__n128, const int); __n64 neon_sshri16 (__n64, const int); __n128 neon_sshriq16(__n128, const int); __n64 neon_sshri32 (__n64, const int); __n128 neon_sshriq32(__n128, const int); __n128 neon_sshriq64(__n128, const int); __n64 neon_sshris64(__n64, const int); __n64 neon_ssrai8 (__n64, __n64, const int); __n128 neon_ssraiq8 (__n128, __n128, const int); __n64 neon_ssrai16 (__n64, __n64, const int); __n128 neon_ssraiq16(__n128, __n128, const int); __n64 neon_ssrai32 (__n64, __n64, const int); __n128 neon_ssraiq32(__n128, __n128, const int); __n128 neon_ssraiq64(__n128, __n128, const int); __n64 neon_ssrais64(__n64, __n64, const int); __n64 neon_ushri8 (__n64, const int); __n128 neon_ushriq8 (__n128, const int); __n64 neon_ushri16 (__n64, const int); __n128 neon_ushriq16(__n128, const int); __n64 neon_ushri32 (__n64, const int); __n128 neon_ushriq32(__n128, const int); __n128 neon_ushriq64(__n128, const int); __n64 neon_ushris64(__n64, const int); __n64 neon_ursrai8 (__n64, __n64, const int); __n128 neon_ursraiq8 (__n128, __n128, const int); __n64 neon_ursrai16 (__n64, __n64, const int); __n128 neon_ursraiq16(__n128, __n128, const int); __n64 neon_ursrai32 (__n64, __n64, const int); __n128 neon_ursraiq32(__n128, __n128, const int); __n128 neon_ursraiq64(__n128, __n128, const int); __n64 neon_ursrais64(__n64, __n64, const int); __n64 neon_usrai8 (__n64, __n64, const int); __n128 neon_usraiq8 (__n128, __n128, const int); __n64 neon_usrai16 (__n64, __n64, const int); __n128 neon_usraiq16(__n128, __n128, const int); __n64 neon_usrai32 (__n64, __n64, const int); __n128 neon_usraiq32(__n128, __n128, const int); __n128 neon_usraiq64(__n128, __n128, const int); 
__n64 neon_usrais64(__n64, __n64, const int); __n64 neon_urshri8 (__n64, const int); __n128 neon_urshriq8 (__n128, const int); __n64 neon_urshri16 (__n64, const int); __n128 neon_urshriq16(__n128, const int); __n64 neon_urshri32 (__n64, const int); __n128 neon_urshriq32(__n128, const int); __n128 neon_urshriq64(__n128, const int); __n64 neon_urshris64(__n64, const int); __n64 neon_srsrai8 (__n64, __n64, const int); __n128 neon_srsraiq8 (__n128, __n128, const int); __n64 neon_srsrai16 (__n64, __n64, const int); __n128 neon_srsraiq16(__n128, __n128, const int); __n64 neon_srsrai32 (__n64, __n64, const int); __n128 neon_srsraiq32(__n128, __n128, const int); __n128 neon_srsraiq64(__n128, __n128, const int); __n64 neon_srsrais64(__n64, __n64, const int); __n64 neon_shli8 (__n64, const int); __n128 neon_shliq8 (__n128, const int); __n64 neon_shli16 (__n64, const int); __n128 neon_shliq16(__n128, const int); __n64 neon_shli32 (__n64, const int); __n128 neon_shliq32(__n128, const int); __n128 neon_shliq64(__n128, const int); __n64 neon_shlis64(__n64, const int); __n64 neon_slii8 (__n64, __n64, const int); __n128 neon_sliiq8 (__n128, __n128, const int); __n64 neon_slii16 (__n64, __n64, const int); __n128 neon_sliiq16(__n128, __n128, const int); __n64 neon_slii32 (__n64, __n64, const int); __n128 neon_sliiq32(__n128, __n128, const int); __n128 neon_sliiq64(__n128, __n128, const int); __n64 neon_sliis64(__n64, __n64, const int); __n64 neon_sqshlui8 (__n64, const int); __n128 neon_sqshluiq8 (__n128, const int); __n64 neon_sqshlui16 (__n64, const int); __n128 neon_sqshluiq16(__n128, const int); __n64 neon_sqshlui32 (__n64, const int); __n128 neon_sqshluiq32(__n128, const int); __n128 neon_sqshluiq64(__n128, const int); __n8 neon_sqshluis8(__n8, const int); __n16 neon_sqshluis16(__n16, const int); float neon_sqshluis32(float, const int); __n64 neon_sqshluis64(__n64, const int); __n64 neon_sqshli8 (__n64, const int); __n128 neon_sqshliq8 (__n128, const int); __n64 neon_sqshli16 
(__n64, const int); __n128 neon_sqshliq16(__n128, const int); __n64 neon_sqshli32 (__n64, const int); __n128 neon_sqshliq32(__n128, const int); __n128 neon_sqshliq64(__n128, const int); __n64 neon_sqshl8 (__n64, __n64); __n128 neon_sqshlq8 (__n128, __n128); __n64 neon_sqshl16 (__n64, __n64); __n128 neon_sqshlq16(__n128, __n128); __n64 neon_sqshl32 (__n64, __n64); __n128 neon_sqshlq32(__n128, __n128); __n128 neon_sqshlq64(__n128, __n128); __n8 neon_sqshlis8(__n8, const int); __n16 neon_sqshlis16(__n16, const int); float neon_sqshlis32(float, const int); __n64 neon_sqshlis64(__n64, const int); __n8 neon_sqshls8(__n8, __n8); __n16 neon_sqshls16(__n16, __n16); float neon_sqshls32(float, float); __n64 neon_sqshls64(__n64, __n64); __n64 neon_uqshli8 (__n64, const int); __n128 neon_uqshliq8 (__n128, const int); __n64 neon_uqshli16 (__n64, const int); __n128 neon_uqshliq16(__n128, const int); __n64 neon_uqshli32 (__n64, const int); __n128 neon_uqshliq32(__n128, const int); __n128 neon_uqshliq64(__n128, const int); __n64 neon_uqshl8 (__n64, __n64); __n128 neon_uqshlq8 (__n128, __n128); __n64 neon_uqshl16 (__n64, __n64); __n128 neon_uqshlq16(__n128, __n128); __n64 neon_uqshl32 (__n64, __n64); __n128 neon_uqshlq32(__n128, __n128); __n128 neon_uqshlq64(__n128, __n128); __n8 neon_uqshlis8(__n8, const int); __n16 neon_uqshlis16(__n16, const int); float neon_uqshlis32(float, const int); __n64 neon_uqshlis64(__n64, const int); __n8 neon_uqshls8(__n8, __n8); __n16 neon_uqshls16(__n16, __n16); float neon_uqshls32(float, float); __n64 neon_uqshls64(__n64, __n64); __n64 neon_sqrshl8 (__n64, __n64); __n128 neon_sqrshlq8 (__n128, __n128); __n64 neon_sqrshl16 (__n64, __n64); __n128 neon_sqrshlq16(__n128, __n128); __n64 neon_sqrshl32 (__n64, __n64); __n128 neon_sqrshlq32(__n128, __n128); __n128 neon_sqrshlq64(__n128, __n128); __n8 neon_sqrshls8(__n8, __n8); __n16 neon_sqrshls16(__n16, __n16); float neon_sqrshls32(float, float); __n64 neon_sqrshls64(__n64, __n64); __n64 neon_urshl8 (__n64, 
__n64); __n128 neon_urshlq8 (__n128, __n128); __n64 neon_urshl16 (__n64, __n64); __n128 neon_urshlq16(__n128, __n128); __n64 neon_urshl32 (__n64, __n64); __n128 neon_urshlq32(__n128, __n128); __n128 neon_urshlq64(__n128, __n128); __n64 neon_urshls64(__n64, __n64); __n64 neon_srshl8 (__n64, __n64); __n128 neon_srshlq8 (__n128, __n128); __n64 neon_srshl16 (__n64, __n64); __n128 neon_srshlq16(__n128, __n128); __n64 neon_srshl32 (__n64, __n64); __n128 neon_srshlq32(__n128, __n128); __n128 neon_srshlq64(__n128, __n128); __n64 neon_srshls64(__n64, __n64); __n64 neon_ushl8 (__n64, __n64); __n128 neon_ushlq8 (__n128, __n128); __n64 neon_ushl16 (__n64, __n64); __n128 neon_ushlq16(__n128, __n128); __n64 neon_ushl32 (__n64, __n64); __n128 neon_ushlq32(__n128, __n128); __n128 neon_ushlq64(__n128, __n128); __n64 neon_ushls64(__n64, __n64); __n64 neon_uqrshl8 (__n64, __n64); __n128 neon_uqrshlq8 (__n128, __n128); __n64 neon_uqrshl16 (__n64, __n64); __n128 neon_uqrshlq16(__n128, __n128); __n64 neon_uqrshl32 (__n64, __n64); __n128 neon_uqrshlq32(__n128, __n128); __n128 neon_uqrshlq64(__n128, __n128); __n8 neon_uqrshls8(__n8, __n8); __n16 neon_uqrshls16(__n16, __n16); float neon_uqrshls32(float, float); __n64 neon_uqrshls64(__n64, __n64); __n64 neon_sshl8 (__n64, __n64); __n128 neon_sshlq8 (__n128, __n128); __n64 neon_sshl16 (__n64, __n64); __n128 neon_sshlq16(__n128, __n128); __n64 neon_sshl32 (__n64, __n64); __n128 neon_sshlq32(__n128, __n128); __n128 neon_sshlq64(__n128, __n128); __n64 neon_sshls64(__n64, __n64); #define vsri_n_p16(src1, src2, src3) neon_srii16(src1, src2, src3) #define vsri_n_p8(src1, src2, src3) neon_srii8(src1, src2, src3) #define vsri_n_s16(src1, src2, src3) neon_srii16(src1, src2, src3) #define vsri_n_s32(src1, src2, src3) neon_srii32(src1, src2, src3) #define vsri_n_s8(src1, src2, src3) neon_srii8(src1, src2, src3) #define vsri_n_u16(src1, src2, src3) neon_srii16(src1, src2, src3) #define vsri_n_u32(src1, src2, src3) neon_srii32(src1, src2, src3) #define 
vsri_n_u8(src1, src2, src3) neon_srii8(src1, src2, src3) #define vsriq_n_p16(src1, src2, src3) neon_sriiq16(src1, src2, src3) #define vsriq_n_p8(src1, src2, src3) neon_sriiq8(src1, src2, src3) #define vsriq_n_s16(src1, src2, src3) neon_sriiq16(src1, src2, src3) #define vsriq_n_s32(src1, src2, src3) neon_sriiq32(src1, src2, src3) #define vsriq_n_s64(src1, src2, src3) neon_sriiq64(src1, src2, src3) #define vsriq_n_s8(src1, src2, src3) neon_sriiq8(src1, src2, src3) #define vsriq_n_u16(src1, src2, src3) neon_sriiq16(src1, src2, src3) #define vsriq_n_u32(src1, src2, src3) neon_sriiq32(src1, src2, src3) #define vsriq_n_u64(src1, src2, src3) neon_sriiq64(src1, src2, src3) #define vsriq_n_u8(src1, src2, src3) neon_sriiq8(src1, src2, src3) #define vrshr_n_s16(src1, src2) neon_srshri16(src1, src2) #define vrshr_n_s32(src1, src2) neon_srshri32(src1, src2) #define vrshr_n_s8(src1, src2) neon_srshri8(src1, src2) #define vrshr_n_u16(src1, src2) neon_urshri16(src1, src2) #define vrshr_n_u32(src1, src2) neon_urshri32(src1, src2) #define vrshr_n_u8(src1, src2) neon_urshri8(src1, src2) #define vshr_n_s16(src1, src2) neon_sshri16(src1, src2) #define vshr_n_s32(src1, src2) neon_sshri32(src1, src2) #define vshr_n_s8(src1, src2) neon_sshri8(src1, src2) #define vshr_n_u16(src1, src2) neon_ushri16(src1, src2) #define vshr_n_u32(src1, src2) neon_ushri32(src1, src2) #define vshr_n_u64(src1, src2) neon_ushris64(src1, src2) #define vshr_n_u8(src1, src2) neon_ushri8(src1, src2) #define vrshrq_n_s16(src1, src2) neon_srshriq16(src1, src2) #define vrshrq_n_s32(src1, src2) neon_srshriq32(src1, src2) #define vrshrq_n_s64(src1, src2) neon_srshriq64(src1, src2) #define vrshrq_n_s8(src1, src2) neon_srshriq8(src1, src2) #define vrshrq_n_u16(src1, src2) neon_urshriq16(src1, src2) #define vrshrq_n_u32(src1, src2) neon_urshriq32(src1, src2) #define vrshrq_n_u64(src1, src2) neon_urshriq64(src1, src2) #define vrshrq_n_u8(src1, src2) neon_urshriq8(src1, src2) #define vshrq_n_s16(src1, src2) 
neon_sshriq16(src1, src2) #define vshrq_n_s32(src1, src2) neon_sshriq32(src1, src2) #define vshrq_n_s64(src1, src2) neon_sshriq64(src1, src2) #define vshrq_n_s8(src1, src2) neon_sshriq8(src1, src2) #define vshrq_n_u16(src1, src2) neon_ushriq16(src1, src2) #define vshrq_n_u32(src1, src2) neon_ushriq32(src1, src2) #define vshrq_n_u64(src1, src2) neon_ushriq64(src1, src2) #define vshrq_n_u8(src1, src2) neon_ushriq8(src1, src2) #define vrsra_n_s16(src1, src2, src3) neon_srsrai16(src1, src2, src3) #define vrsra_n_s32(src1, src2, src3) neon_srsrai32(src1, src2, src3) #define vrsra_n_s8(src1, src2, src3) neon_srsrai8(src1, src2, src3) #define vrsra_n_u16(src1, src2, src3) neon_ursrai16(src1, src2, src3) #define vrsra_n_u32(src1, src2, src3) neon_ursrai32(src1, src2, src3) #define vrsra_n_u8(src1, src2, src3) neon_ursrai8(src1, src2, src3) #define vsra_n_s16(src1, src2, src3) neon_ssrai16(src1, src2, src3) #define vsra_n_s32(src1, src2, src3) neon_ssrai32(src1, src2, src3) #define vsra_n_s8(src1, src2, src3) neon_ssrai8(src1, src2, src3) #define vsra_n_u16(src1, src2, src3) neon_usrai16(src1, src2, src3) #define vsra_n_u32(src1, src2, src3) neon_usrai32(src1, src2, src3) #define vsra_n_u8(src1, src2, src3) neon_usrai8(src1, src2, src3) #define vrsraq_n_s16(src1, src2, src3) neon_srsraiq16(src1, src2, src3) #define vrsraq_n_s32(src1, src2, src3) neon_srsraiq32(src1, src2, src3) #define vrsraq_n_s64(src1, src2, src3) neon_srsraiq64(src1, src2, src3) #define vrsraq_n_s8(src1, src2, src3) neon_srsraiq8(src1, src2, src3) #define vrsraq_n_u16(src1, src2, src3) neon_ursraiq16(src1, src2, src3) #define vrsraq_n_u32(src1, src2, src3) neon_ursraiq32(src1, src2, src3) #define vrsraq_n_u64(src1, src2, src3) neon_ursraiq64(src1, src2, src3) #define vrsraq_n_u8(src1, src2, src3) neon_ursraiq8(src1, src2, src3) #define vsraq_n_s16(src1, src2, src3) neon_ssraiq16(src1, src2, src3) #define vsraq_n_s32(src1, src2, src3) neon_ssraiq32(src1, src2, src3) #define vsraq_n_s64(src1, src2, src3) 
neon_ssraiq64(src1, src2, src3) #define vsraq_n_s8(src1, src2, src3) neon_ssraiq8(src1, src2, src3) #define vsraq_n_u16(src1, src2, src3) neon_usraiq16(src1, src2, src3) #define vsraq_n_u32(src1, src2, src3) neon_usraiq32(src1, src2, src3) #define vsraq_n_u64(src1, src2, src3) neon_usraiq64(src1, src2, src3) #define vsraq_n_u8(src1, src2, src3) neon_usraiq8(src1, src2, src3) #define vqshl_n_s16(src1, src2) neon_sqshli16(src1, src2) #define vqshl_n_s32(src1, src2) neon_sqshli32(src1, src2) #define vqshl_n_s8(src1, src2) neon_sqshli8(src1, src2) #define vqshl_n_u16(src1, src2) neon_uqshli16(src1, src2) #define vqshl_n_u32(src1, src2) neon_uqshli32(src1, src2) #define vqshl_n_u8(src1, src2) neon_uqshli8(src1, src2) #define vqshlq_n_s16(src1, src2) neon_sqshliq16(src1, src2) #define vqshlq_n_s32(src1, src2) neon_sqshliq32(src1, src2) #define vqshlq_n_s64(src1, src2) neon_sqshliq64(src1, src2) #define vqshlq_n_s8(src1, src2) neon_sqshliq8(src1, src2) #define vqshlq_n_u16(src1, src2) neon_uqshliq16(src1, src2) #define vqshlq_n_u32(src1, src2) neon_uqshliq32(src1, src2) #define vqshlq_n_u64(src1, src2) neon_uqshliq64(src1, src2) #define vqshlq_n_u8(src1, src2) neon_uqshliq8(src1, src2) #define vqshlu_n_s16(src1, src2) neon_sqshlui16(src1, src2) #define vqshlu_n_s32(src1, src2) neon_sqshlui32(src1, src2) #define vqshlu_n_s8(src1, src2) neon_sqshlui8(src1, src2) #define vqshluq_n_s16(src1, src2) neon_sqshluiq16(src1, src2) #define vqshluq_n_s32(src1, src2) neon_sqshluiq32(src1, src2) #define vqshluq_n_s64(src1, src2) neon_sqshluiq64(src1, src2) #define vqshluq_n_s8(src1, src2) neon_sqshluiq8(src1, src2) #define vshl_n_s16(src1, src2) neon_shli16(src1, src2) #define vshl_n_s32(src1, src2) neon_shli32(src1, src2) #define vshl_n_s8(src1, src2) neon_shli8(src1, src2) #define vshl_n_u16(src1, src2) neon_shli16(src1, src2) #define vshl_n_u32(src1, src2) neon_shli32(src1, src2) #define vshl_n_u8(src1, src2) neon_shliq8(src1, src2) #define vshlq_n_s16(src1, src2) neon_shliq16(src1, 
src2) #define vshlq_n_s32(src1, src2) neon_shliq32(src1, src2) #define vshlq_n_s64(src1, src2) neon_shliq64(src1, src2) #define vshlq_n_s8(src1, src2) neon_shliq8(src1, src2) #define vshlq_n_u16(src1, src2) neon_shliq16(src1, src2) #define vshlq_n_u32(src1, src2) neon_shliq32(src1, src2) #define vshlq_n_u64(src1, src2) neon_shliq64(src1, src2) #define vshlq_n_u8(src1, src2) neon_shliq8(src1, src2) #define vqrshl_s16(src1, src2) neon_sqrshl16(src1, src2) #define vqrshl_s32(src1, src2) neon_sqrshl32(src1, src2) #define vqrshl_s8(src1, src2) neon_sqrshl8(src1, src2) #define vqrshl_u16(src1, src2) neon_uqrshl16(src1, src2) #define vqrshl_u32(src1, src2) neon_uqrshl32(src1, src2) #define vqrshl_u8(src1, src2) neon_uqrshl8(src1, src2) #define vqshl_s16(src1, src2) neon_sqshl16(src1, src2) #define vqshl_s32(src1, src2) neon_sqshl32(src1, src2) #define vqshl_s8(src1, src2) neon_sqshl8(src1, src2) #define vqshl_u16(src1, src2) neon_uqshl16(src1, src2) #define vqshl_u32(src1, src2) neon_uqshl32(src1, src2) #define vqshl_u8(src1, src2) neon_uqshl8(src1, src2) #define vrshl_s16(src1, src2) neon_srshl16(src1, src2) #define vrshl_s32(src1, src2) neon_srshl32(src1, src2) #define vrshl_s8(src1, src2) neon_srshl8(src1, src2) #define vrshl_u16(src1, src2) neon_urshl16(src1, src2) #define vrshl_u32(src1, src2) neon_urshl32(src1, src2) #define vrshl_u8(src1, src2) neon_urshl8(src1, src2) #define vshl_s16(src1, src2) neon_sshl16(src1, src2) #define vshl_s32(src1, src2) neon_sshl32(src1, src2) #define vshl_s8(src1, src2) neon_sshl8(src1, src2) #define vshl_u16(src1, src2) neon_ushl16(src1, src2) #define vshl_u32(src1, src2) neon_ushl32(src1, src2) #define vshl_u8(src1, src2) neon_ushl8(src1, src2) #define vqrshlq_s16(src1, src2) neon_sqrshlq16(src1, src2) #define vqrshlq_s32(src1, src2) neon_sqrshlq32(src1, src2) #define vqrshlq_s64(src1, src2) neon_sqrshlq64(src1, src2) #define vqrshlq_s8(src1, src2) neon_sqrshlq8(src1, src2) #define vqrshlq_u16(src1, src2) neon_uqrshlq16(src1, src2) 
#define vqrshlq_u32(src1, src2) neon_uqrshlq32(src1, src2) #define vqrshlq_u64(src1, src2) neon_uqrshlq64(src1, src2) #define vqrshlq_u8(src1, src2) neon_uqrshlq8(src1, src2) #define vqshlq_s16(src1, src2) neon_sqshlq16(src1, src2) #define vqshlq_s32(src1, src2) neon_sqshlq32(src1, src2) #define vqshlq_s64(src1, src2) neon_sqshlq64(src1, src2) #define vqshlq_s8(src1, src2) neon_sqshlq8(src1, src2) #define vqshlq_u16(src1, src2) neon_uqshlq16(src1, src2) #define vqshlq_u32(src1, src2) neon_uqshlq32(src1, src2) #define vqshlq_u64(src1, src2) neon_uqshlq64(src1, src2) #define vqshlq_u8(src1, src2) neon_uqshlq8(src1, src2) #define vrshlq_s16(src1, src2) neon_srshlq16(src1, src2) #define vrshlq_s32(src1, src2) neon_srshlq32(src1, src2) #define vrshlq_s64(src1, src2) neon_srshlq64(src1, src2) #define vrshlq_s8(src1, src2) neon_srshlq8(src1, src2) #define vrshlq_u16(src1, src2) neon_urshlq16(src1, src2) #define vrshlq_u32(src1, src2) neon_urshlq32(src1, src2) #define vrshlq_u64(src1, src2) neon_urshlq64(src1, src2) #define vrshlq_u8(src1, src2) neon_urshlq8(src1, src2) #define vshlq_s16(src1, src2) neon_sshlq16(src1, src2) #define vshlq_s32(src1, src2) neon_sshlq32(src1, src2) #define vshlq_s64(src1, src2) neon_sshlq64(src1, src2) #define vshlq_s8(src1, src2) neon_sshlq8(src1, src2) #define vshlq_u16(src1, src2) neon_ushlq16(src1, src2) #define vshlq_u32(src1, src2) neon_ushlq32(src1, src2) #define vshlq_u64(src1, src2) neon_ushlq64(src1, src2) #define vshlq_u8(src1, src2) neon_ushlq8(src1, src2) #define vsli_n_p16(src1, src2, src3) neon_slii16(src1, src2, src3) #define vsli_n_p8(src1, src2, src3) neon_slii8(src1, src2, src3) #define vsli_n_s16(src1, src2, src3) neon_slii16(src1, src2, src3) #define vsli_n_s32(src1, src2, src3) neon_slii32(src1, src2, src3) #define vsli_n_s8(src1, src2, src3) neon_slii8(src1, src2, src3) #define vsli_n_u16(src1, src2, src3) neon_slii16(src1, src2, src3) #define vsli_n_u32(src1, src2, src3) neon_slii32(src1, src2, src3) #define 
vsli_n_u8(src1, src2, src3) neon_slii8(src1, src2, src3) #define vsliq_n_p16(src1, src2, src3) neon_sliiq16(src1, src2, src3) #define vsliq_n_p8(src1, src2, src3) neon_sliiq8(src1, src2, src3) #define vsliq_n_s16(src1, src2, src3) neon_sliiq16(src1, src2, src3) #define vsliq_n_s32(src1, src2, src3) neon_sliiq32(src1, src2, src3) #define vsliq_n_s64(src1, src2, src3) neon_sliiq64(src1, src2, src3) #define vsliq_n_s8(src1, src2, src3) neon_sliiq8(src1, src2, src3) #define vsliq_n_u16(src1, src2, src3) neon_sliiq16(src1, src2, src3) #define vsliq_n_u32(src1, src2, src3) neon_sliiq32(src1, src2, src3) #define vsliq_n_u64(src1, src2, src3) neon_sliiq64(src1, src2, src3) #define vsliq_n_u8(src1, src2, src3) neon_sliiq8(src1, src2, src3) // TBL/TBX __n64 neon_tbx4_q8(__n64 src1, __n128x4 reglist, __n64 src2); __n128 neon_tbx4_qq8(__n128 src1, __n128x4 reglist, __n128 src2); __n64 neon_tbx3_q8(__n64 src1, __n128x3 reglist, __n64 src2); __n128 neon_tbx3_qq8(__n128 src1, __n128x3 reglist, __n128 src2); __n64 neon_tbx2_q8(__n64 src1, __n128x2 reglist, __n64 src2); __n128 neon_tbx2_qq8(__n128 src1, __n128x2 reglist, __n128 src2); __n64 neon_tbx1_q8(__n64 src1, __n128 reglist, __n64 src2); __n128 neon_tbx1_qq8(__n128 src1, __n128 reglist, __n128 src2); __n64 neon_tbl4_q8(__n128x4 reglist, __n64 src2); __n128 neon_tbl4_qq8(__n128x4 reglist, __n128 src2); __n64 neon_tbl3_q8(__n128x3 reglist, __n64 src2); __n128 neon_tbl3_qq8(__n128x3 reglist, __n128 src2); __n64 neon_tbl2_q8(__n128x2 reglist, __n64 src2); __n128 neon_tbl2_qq8(__n128x2 reglist, __n128 src2); __n64 neon_tbl1_q8(__n128 reglist, __n64 src2); __n128 neon_tbl1_qq8(__n128 reglist, __n128 src2); #define neon_tbx4_8(src1, src2, src3) neon_tbx2_q8(src1, {vget_lane_s64(src2.val[0], 0), vget_lane_s64(src2.val[1], 0), vget_lane_s64(src2.val[2], 0), vget_lane_s64(src2.val[3], 0)}, src3) #define neon_tbx3_8(src1, src2, src3) neon_tbx2_q8(src1, {vget_lane_s64(src2.val[0], 0), vget_lane_s64(src2.val[1], 0), 
vget_lane_s64(src2.val[2], 0), 0}, src3) #define neon_tbx2_8(src1, src2, src3) neon_tbx1_q8(src1, {vget_lane_s64(src2.val[0], 0), vget_lane_s64(src2.val[1], 0)}, src3) #define neon_tbx1_8(src1, src2, src3) neon_tbx1_q8(src1, {vget_lane_s64(src2, 0), 0}, src3) #define neon_tbl4_8(src1, src2) neon_tbl2_q8({vget_lane_s64(src1.val[0], 0), vget_lane_s64(src1.val[1], 0), vget_lane_s64(src1.val[2], 0), vget_lane_s64(src1.val[3], 0)}, src2) #define neon_tbl3_8(src1, src2) neon_tbl2_q8({vget_lane_s64(src1.val[0], 0), vget_lane_s64(src1.val[1], 0), vget_lane_s64(src1.val[2], 0), 0}, src2) #define neon_tbl2_8(src1, src2) neon_tbl1_q8({vget_lane_s64(src1.val[0], 0), vget_lane_s64(src1.val[1], 0)}, src2) #define neon_tbl1_8(src1, src2) neon_tbl1_q8({vget_lane_s64(src1, 0), 0}, src2) #define vtbx4_p8(src1, src2, src3) neon_tbx2_q8(src1, {vget_lane_s64(src2.val[0], 0), vget_lane_s64(src2.val[1], 0), vget_lane_s64(src2.val[2], 0), vget_lane_s64(src2.val[3], 0)}, src3) #define vtbx4_s8(src1, src2, src3) neon_tbx2_q8(src1, {vget_lane_s64(src2.val[0], 0), vget_lane_s64(src2.val[1], 0), vget_lane_s64(src2.val[2], 0), vget_lane_s64(src2.val[3], 0)}, src3) #define vtbx4_u8(src1, src2, src3) neon_tbx2_q8(src1, {vget_lane_s64(src2.val[0], 0), vget_lane_s64(src2.val[1], 0), vget_lane_s64(src2.val[2], 0), vget_lane_s64(src2.val[3], 0)}, src3) #define vtbx3_p8(src1, src2, src3) neon_tbx2_q8(src1, {vget_lane_s64(src2.val[0], 0), vget_lane_s64(src2.val[1], 0), vget_lane_s64(src2.val[2], 0), 0}, src3) #define vtbx3_s8(src1, src2, src3) neon_tbx2_q8(src1, {vget_lane_s64(src2.val[0], 0), vget_lane_s64(src2.val[1], 0), vget_lane_s64(src2.val[2], 0), 0}, src3) #define vtbx3_u8(src1, src2, src3) neon_tbx2_q8(src1, {vget_lane_s64(src2.val[0], 0), vget_lane_s64(src2.val[1], 0), vget_lane_s64(src2.val[2], 0), 0}, src3) #define vtbx2_p8(src1, src2, src3) neon_tbx1_q8(src1, {vget_lane_s64(src2.val[0], 0), vget_lane_s64(src2.val[1], 0)}, src3) #define vtbx2_s8(src1, src2, src3) neon_tbx1_q8(src1, 
{vget_lane_s64(src2.val[0], 0), vget_lane_s64(src2.val[1], 0)}, src3) #define vtbx2_u8(src1, src2, src3) neon_tbx1_q8(src1, {vget_lane_s64(src2.val[0], 0), vget_lane_s64(src2.val[1], 0)}, src3) #define vtbx1_p8(src1, src2, src3) neon_tbx1_q8(src1, {vget_lane_s64(src2, 0), 0}, src3) #define vtbx1_s8(src1, src2, src3) neon_tbx1_q8(src1, {vget_lane_s64(src2, 0), 0}, src3) #define vtbx1_u8(src1, src2, src3) neon_tbx1_q8(src1, {vget_lane_u64(src2, 0), 0}, src3) #define vtbl4_p8(src1, src2) neon_tbl2_q8({vget_lane_s64(src1.val[0], 0), vget_lane_s64(src1.val[1], 0), vget_lane_s64(src1.val[2], 0), vget_lane_s64(src1.val[3], 0)}, src2) #define vtbl4_s8(src1, src2) neon_tbl2_q8({vget_lane_s64(src1.val[0], 0), vget_lane_s64(src1.val[1], 0), vget_lane_s64(src1.val[2], 0), vget_lane_s64(src1.val[3], 0)}, src2) #define vtbl4_u8(src1, src2) neon_tbl2_q8({vget_lane_s64(src1.val[0], 0), vget_lane_s64(src1.val[1], 0), vget_lane_s64(src1.val[2], 0), vget_lane_s64(src1.val[3], 0)}, src2) #define vtbl3_p8(src1, src2) neon_tbl2_q8({vget_lane_s64(src1.val[0], 0), vget_lane_s64(src1.val[1], 0), vget_lane_s64(src1.val[2], 0), 0}, src2) #define vtbl3_s8(src1, src2) neon_tbl2_q8({vget_lane_s64(src1.val[0], 0), vget_lane_s64(src1.val[1], 0), vget_lane_s64(src1.val[2], 0), 0}, src2) #define vtbl3_u8(src1, src2) neon_tbl2_q8({vget_lane_s64(src1.val[0], 0), vget_lane_s64(src1.val[1], 0), vget_lane_s64(src1.val[2], 0), 0}, src2) #define vtbl2_p8(src1, src2) neon_tbl1_q8({vget_lane_s64(src1.val[0], 0), vget_lane_s64(src1.val[1], 0)}, src2) #define vtbl2_s8(src1, src2) neon_tbl1_q8({vget_lane_s64(src1.val[0], 0), vget_lane_s64(src1.val[1], 0)}, src2) #define vtbl2_u8(src1, src2) neon_tbl1_q8({vget_lane_s64(src1.val[0], 0), vget_lane_s64(src1.val[1], 0)}, src2) #define vtbl1_p8(src1, src2) neon_tbl1_q8({vget_lane_s64(src1, 0), 0}, src2) #define vtbl1_s8(src1, src2) neon_tbl1_q8({vget_lane_s64(src1, 0), 0}, src2) #define vtbl1_u8(src1, src2) neon_tbl1_q8({vget_lane_u64(src1, 0), 0}, src2) #define 
vqtbl1q_u8(src1, src2) neon_tbl1_qq8(src1, src2) // LD4R/LD4/LD3R/LD3/LD2R/LD2 __n64x4 neon_ld4r_8(const __int8 * ptr); __n128x4 neon_ld4r_q8(const __int8 * ptr); __n64x4 neon_ld4r_16(const __int16 * ptr); __n128x4 neon_ld4r_q16(const __int16 * ptr); __n64x4 neon_ld4r_32(const __int32 * ptr); __n128x4 neon_ld4r_q32(const __int32 * ptr); __n64x4 neon_ld4r_64(const __int64 * ptr); __n128x4 neon_ld4r_q64(const __int64 * ptr); __n64x4 neon_ld4m_8(const __int8 * ptr); __n128x4 neon_ld4m_q8(const __int8 * ptr); __n64x4 neon_ld4m_16(const __int16 * ptr); __n128x4 neon_ld4m_q16(const __int16 * ptr); __n64x4 neon_ld4m_32(const __int32 * ptr); __n128x4 neon_ld4m_q32(const __int32 * ptr); __n128x4 neon_ld4m_q64(const __int64 * ptr); __n64x4 neon_ld4s_8(const __int8 * ptr, __n64x4 src, const int lane); __n128x4 neon_ld4s_q8(const __int8 * ptr, __n128x4 src, const int lane); __n64x4 neon_ld4s_16(const __int16 * ptr, __n64x4 src, const int lane); __n128x4 neon_ld4s_q16(const __int16 * ptr, __n128x4 src, const int lane); __n64x4 neon_ld4s_32(const __int32 * ptr, __n64x4 src, const int lane); __n128x4 neon_ld4s_q32(const __int32 * ptr, __n128x4 src, const int lane); __n64x4 neon_ld4s_64(const __int64 * ptr, __n64x4 src, const int lane); __n128x4 neon_ld4s_q64(const __int64 * ptr, __n128x4 src, const int lane); __n64x3 neon_ld3r_8(const __int8 * ptr); __n128x3 neon_ld3r_q8(const __int8 * ptr); __n64x3 neon_ld3r_16(const __int16 * ptr); __n128x3 neon_ld3r_q16(const __int16 * ptr); __n64x3 neon_ld3r_32(const __int32 * ptr); __n128x3 neon_ld3r_q32(const __int32 * ptr); __n64x3 neon_ld3r_64(const __int64 * ptr); __n128x3 neon_ld3r_q64(const __int64 * ptr); __n64x3 neon_ld3m_8(const __int8 * ptr); __n128x3 neon_ld3m_q8(const __int8 * ptr); __n64x3 neon_ld3m_16(const __int16 * ptr); __n128x3 neon_ld3m_q16(const __int16 * ptr); __n64x3 neon_ld3m_32(const __int32 * ptr); __n128x3 neon_ld3m_q32(const __int32 * ptr); __n128x3 neon_ld3m_q64(const __int64 * ptr); __n64x3 neon_ld3s_8(const 
__int8 * ptr, __n64x3 src, const int lane); __n128x3 neon_ld3s_q8(const __int8 * ptr, __n128x3 src, const int lane); __n64x3 neon_ld3s_16(const __int16 * ptr, __n64x3 src, const int lane); __n128x3 neon_ld3s_q16(const __int16 * ptr, __n128x3 src, const int lane); __n64x3 neon_ld3s_32(const __int32 * ptr, __n64x3 src, const int lane); __n128x3 neon_ld3s_q32(const __int32 * ptr, __n128x3 src, const int lane); __n64x3 neon_ld3s_64(const __int64 * ptr, __n64x3 src, const int lane); __n128x3 neon_ld3s_q64(const __int64 * ptr, __n128x3 src, const int lane); __n64x2 neon_ld2r_8(const __int8 * ptr); __n128x2 neon_ld2r_q8(const __int8 * ptr); __n64x2 neon_ld2r_16(const __int16 * ptr); __n128x2 neon_ld2r_q16(const __int16 * ptr); __n64x2 neon_ld2r_32(const __int32 * ptr); __n128x2 neon_ld2r_q32(const __int32 * ptr); __n64x2 neon_ld2r_64(const __int64 * ptr); __n128x2 neon_ld2r_q64(const __int64 * ptr); __n64x2 neon_ld2m_8(const __int8 * ptr); __n128x2 neon_ld2m_q8(const __int8 * ptr); __n64x2 neon_ld2m_16(const __int16 * ptr); __n128x2 neon_ld2m_q16(const __int16 * ptr); __n64x2 neon_ld2m_32(const __int32 * ptr); __n128x2 neon_ld2m_q32(const __int32 * ptr); __n128x2 neon_ld2m_q64(const __int64 * ptr); __n64x2 neon_ld2s_8(const __int8 * ptr, __n64x2 src, const int lane); __n128x2 neon_ld2s_q8(const __int8 * ptr, __n128x2 src, const int lane); __n64x2 neon_ld2s_16(const __int16 * ptr, __n64x2 src, const int lane); __n128x2 neon_ld2s_q16(const __int16 * ptr, __n128x2 src, const int lane); __n64x2 neon_ld2s_32(const __int32 * ptr, __n64x2 src, const int lane); __n128x2 neon_ld2s_q32(const __int32 * ptr, __n128x2 src, const int lane); __n64x2 neon_ld2s_64(const __int64 * ptr, __n64x2 src, const int lane); __n128x2 neon_ld2s_q64(const __int64 * ptr, __n128x2 src, const int lane); __n64 neon_ld1r_8(const __int8 * ptr); __n128 neon_ld1r_q8(const __int8 * ptr); __n64 neon_ld1r_16(const __int16 * ptr); __n128 neon_ld1r_q16(const __int16 * ptr); __n64 neon_ld1r_32(const __int32 * ptr); 
__n128 neon_ld1r_q32(const __int32 * ptr); __n64 neon_ld1r_64(const __int64 * ptr); __n128 neon_ld1r_q64(const __int64 * ptr); __n64 neon_ld1m_8(const __int8 * ptr); __n128 neon_ld1m_q8(const __int8 * ptr); __n64 neon_ld1m_16(const __int16 * ptr); __n128 neon_ld1m_q16(const __int16 * ptr); __n64 neon_ld1m_32(const __int32 * ptr); __n128 neon_ld1m_q32(const __int32 * ptr); __n64 neon_ld1m_64(const __int64 * ptr); __n128 neon_ld1m_q64(const __int64 * ptr); __n64x2 neon_ld1m2_8(const __int8 * ptr); __n128x2 neon_ld1m2_q8(const __int8 * ptr); __n64x2 neon_ld1m2_16(const __int16 * ptr); __n128x2 neon_ld1m2_q16(const __int16 * ptr); __n64x2 neon_ld1m2_32(const __int32 * ptr); __n128x2 neon_ld1m2_q32(const __int32 * ptr); __n64x2 neon_ld1m2_64(const __int64 * ptr); __n128x2 neon_ld1m2_q64(const __int64 * ptr); __n64x3 neon_ld1m3_8(const __int8 * ptr); __n128x3 neon_ld1m3_q8(const __int8 * ptr); __n64x3 neon_ld1m3_16(const __int16 * ptr); __n128x3 neon_ld1m3_q16(const __int16 * ptr); __n64x3 neon_ld1m3_32(const __int32 * ptr); __n128x3 neon_ld1m3_q32(const __int32 * ptr); __n64x3 neon_ld1m3_64(const __int64 * ptr); __n128x3 neon_ld1m3_q64(const __int64 * ptr); __n64x4 neon_ld1m4_8(const __int8 * ptr); __n128x4 neon_ld1m4_q8(const __int8 * ptr); __n64x4 neon_ld1m4_16(const __int16 * ptr); __n128x4 neon_ld1m4_q16(const __int16 * ptr); __n64x4 neon_ld1m4_32(const __int32 * ptr); __n128x4 neon_ld1m4_q32(const __int32 * ptr); __n64x4 neon_ld1m4_64(const __int64 * ptr); __n128x4 neon_ld1m4_q64(const __int64 * ptr); __n64 neon_ld1s_8(const __int8 * ptr, __n64 src, const int lane); __n128 neon_ld1s_q8(const __int8 * ptr, __n128 src, const int lane); __n64 neon_ld1s_16(const __int16 * ptr, __n64 src, const int lane); __n128 neon_ld1s_q16(const __int16 * ptr, __n128 src, const int lane); __n64 neon_ld1s_32(const __int32 * ptr, __n64 src, const int lane); __n128 neon_ld1s_q32(const __int32 * ptr, __n128 src, const int lane); __n64 neon_ld1s_64(const __int64 * ptr, __n64 src, const 
// (continued) Remainder of the prototype that starts on the previous line.
int lane);
__n128 neon_ld1s_q64(const __int64 *ptr, __n128 src, const int lane);

// vld4_dup_*: cast the pointer argument and forward it to the matching neon_ld4r_* intrinsic.
#define vld4_dup_f32(src) neon_ld4r_32((__int32 *)src)
#define vld4_dup_p16(src) neon_ld4r_16((__int16 *)src)
#define vld4_dup_p8(src) neon_ld4r_8((__int8 *)src)
#define vld4_dup_s16(src) neon_ld4r_16((__int16 *)src)
#define vld4_dup_s32(src) neon_ld4r_32((__int32 *)src)
#define vld4_dup_s8(src) neon_ld4r_8((__int8 *)src)
#define vld4_dup_u16(src) neon_ld4r_16((__int16 *)src)
#define vld4_dup_u32(src) neon_ld4r_32((__int32 *)src)
#define vld4_dup_u8(src) neon_ld4r_8((__int8 *)src)
#define vld4_dup_s64(src) neon_ld4r_64((__int64 *)src)
#define vld4_dup_u64(src) neon_ld4r_64((__int64 *)src)

// vld4_*: forward to neon_ld4m_*; the 64-bit element forms map to neon_ld1m4_64 instead.
#define vld4_f32(src) neon_ld4m_32((__int32 *)src)
#define vld4_p16(src) neon_ld4m_16((__int16 *)src)
#define vld4_p8(src) neon_ld4m_8((__int8 *)src)
#define vld4_s16(src) neon_ld4m_16((__int16 *)src)
#define vld4_s32(src) neon_ld4m_32((__int32 *)src)
#define vld4_s8(src) neon_ld4m_8((__int8 *)src)
#define vld4_u16(src) neon_ld4m_16((__int16 *)src)
#define vld4_u32(src) neon_ld4m_32((__int32 *)src)
#define vld4_u8(src) neon_ld4m_8((__int8 *)src)
#define vld4_s64(src) neon_ld1m4_64((__int64 *)src)
#define vld4_u64(src) neon_ld1m4_64((__int64 *)src)

// vld4q_dup_*: forward to the neon_ld4r_q* intrinsics.
#define vld4q_dup_f32(src) neon_ld4r_q32((__int32 *)src)
#define vld4q_dup_p16(src) neon_ld4r_q16((__int16 *)src)
#define vld4q_dup_p8(src) neon_ld4r_q8((__int8 *)src)
#define vld4q_dup_s16(src) neon_ld4r_q16((__int16 *)src)
#define vld4q_dup_s32(src) neon_ld4r_q32((__int32 *)src)
#define vld4q_dup_s8(src) neon_ld4r_q8((__int8 *)src)
#define vld4q_dup_u16(src) neon_ld4r_q16((__int16 *)src)
#define vld4q_dup_u32(src) neon_ld4r_q32((__int32 *)src)
#define vld4q_dup_u8(src) neon_ld4r_q8((__int8 *)src)
#define vld4q_dup_s64(src) neon_ld4r_q64((__int64 *)src)
#define vld4q_dup_u64(src) neon_ld4r_q64((__int64 *)src)

// vld4q_*: forward to the neon_ld4m_q* intrinsics.
#define vld4q_f32(src) neon_ld4m_q32((__int32 *)src)
#define vld4q_p16(src) neon_ld4m_q16((__int16 *)src)
#define vld4q_p8(src) neon_ld4m_q8((__int8 *)src)

// Start of a macro whose replacement text is on the next line.
#define vld4q_s16(src)
// (continued) Replacement text of the macro whose #define starts on the previous line.
neon_ld4m_q16((__int16*)src)

// vld4q_* (remaining forms): forward to the neon_ld4m_q* intrinsics.
#define vld4q_s32(src) neon_ld4m_q32((__int32 *)src)
#define vld4q_s8(src) neon_ld4m_q8((__int8 *)src)
#define vld4q_u16(src) neon_ld4m_q16((__int16 *)src)
#define vld4q_u32(src) neon_ld4m_q32((__int32 *)src)
#define vld4q_u8(src) neon_ld4m_q8((__int8 *)src)
#define vld4q_s64(src) neon_ld4m_q64((__int64 *)src)
#define vld4q_u64(src) neon_ld4m_q64((__int64 *)src)

// vld4_lane_*(src1, src2, src3): src1 is the pointer (cast to the intrinsic's
// element type), src2 the source aggregate, src3 the lane index; all three are
// forwarded to neon_ld4s_*.
#define vld4_lane_f32(src1, src2, src3) neon_ld4s_32((__int32 *)src1, src2, src3)
#define vld4_lane_p16(src1, src2, src3) neon_ld4s_16((__int16 *)src1, src2, src3)
#define vld4_lane_p8(src1, src2, src3) neon_ld4s_8((__int8 *)src1, src2, src3)
#define vld4_lane_s16(src1, src2, src3) neon_ld4s_16((__int16 *)src1, src2, src3)
#define vld4_lane_s32(src1, src2, src3) neon_ld4s_32((__int32 *)src1, src2, src3)
#define vld4_lane_s64(src1, src2, src3) neon_ld4s_64((__int64 *)src1, src2, src3)
#define vld4_lane_s8(src1, src2, src3) neon_ld4s_8((__int8 *)src1, src2, src3)
#define vld4_lane_u16(src1, src2, src3) neon_ld4s_16((__int16 *)src1, src2, src3)
#define vld4_lane_u32(src1, src2, src3) neon_ld4s_32((__int32 *)src1, src2, src3)
#define vld4_lane_u8(src1, src2, src3) neon_ld4s_8((__int8 *)src1, src2, src3)

// vld4q_lane_*(src1, src2, src3): same argument roles, forwarded to neon_ld4s_q*.
#define vld4q_lane_f32(src1, src2, src3) neon_ld4s_q32((__int32 *)src1, src2, src3)
#define vld4q_lane_p8(src1, src2, src3) neon_ld4s_q8((__int8 *)src1, src2, src3)
#define vld4q_lane_p16(src1, src2, src3) neon_ld4s_q16((__int16 *)src1, src2, src3)
#define vld4q_lane_s16(src1, src2, src3) neon_ld4s_q16((__int16 *)src1, src2, src3)
#define vld4q_lane_s32(src1, src2, src3) neon_ld4s_q32((__int32 *)src1, src2, src3)
#define vld4q_lane_s64(src1, src2, src3) neon_ld4s_q64((__int64 *)src1, src2, src3)
#define vld4q_lane_u16(src1, src2, src3) neon_ld4s_q16((__int16 *)src1, src2, src3)
// Fixed: the expansion previously dropped src1 and cast src2 (the vector
// aggregate) to a pointer, passing only two arguments. It now forwards
// (pointer, aggregate, lane) exactly like vld4q_lane_s32 above.
#define vld4q_lane_u32(src1, src2, src3) neon_ld4s_q32((__int32 *)src1, src2, src3)

// vld3_dup_*: cast the pointer argument and forward it to the matching neon_ld3r_* intrinsic.
#define vld3_dup_f32(src) neon_ld3r_32((__int32 *)src)
#define vld3_dup_p16(src) neon_ld3r_16((__int16 *)src)

// Start of a macro whose replacement text is on the next line.
#define vld3_dup_p8(src)
// (continued) Replacement text of the macro whose #define starts on the previous line.
neon_ld3r_8((__int8*)src)

// vld3_dup_* (remaining forms): forward to the neon_ld3r_* intrinsics.
#define vld3_dup_s16(src) neon_ld3r_16((__int16 *)src)
#define vld3_dup_s32(src) neon_ld3r_32((__int32 *)src)
#define vld3_dup_s8(src) neon_ld3r_8((__int8 *)src)
#define vld3_dup_u16(src) neon_ld3r_16((__int16 *)src)
#define vld3_dup_u32(src) neon_ld3r_32((__int32 *)src)
#define vld3_dup_u8(src) neon_ld3r_8((__int8 *)src)
#define vld3_dup_s64(src) neon_ld3r_64((__int64 *)src)
#define vld3_dup_u64(src) neon_ld3r_64((__int64 *)src)

// vld3_*: forward to neon_ld3m_*; the 64-bit element forms map to neon_ld1m3_64 instead.
#define vld3_f32(src) neon_ld3m_32((__int32 *)src)
#define vld3_p16(src) neon_ld3m_16((__int16 *)src)
#define vld3_p8(src) neon_ld3m_8((__int8 *)src)
#define vld3_s16(src) neon_ld3m_16((__int16 *)src)
#define vld3_s32(src) neon_ld3m_32((__int32 *)src)
#define vld3_s8(src) neon_ld3m_8((__int8 *)src)
#define vld3_u16(src) neon_ld3m_16((__int16 *)src)
#define vld3_u32(src) neon_ld3m_32((__int32 *)src)
#define vld3_u8(src) neon_ld3m_8((__int8 *)src)
#define vld3_s64(src) neon_ld1m3_64((__int64 *)src)
#define vld3_u64(src) neon_ld1m3_64((__int64 *)src)

// vld3q_dup_*: forward to the neon_ld3r_q* intrinsics.
#define vld3q_dup_f32(src) neon_ld3r_q32((__int32 *)src)
#define vld3q_dup_p16(src) neon_ld3r_q16((__int16 *)src)
#define vld3q_dup_p8(src) neon_ld3r_q8((__int8 *)src)
#define vld3q_dup_s16(src) neon_ld3r_q16((__int16 *)src)
#define vld3q_dup_s32(src) neon_ld3r_q32((__int32 *)src)
#define vld3q_dup_s8(src) neon_ld3r_q8((__int8 *)src)
#define vld3q_dup_u16(src) neon_ld3r_q16((__int16 *)src)
#define vld3q_dup_u32(src) neon_ld3r_q32((__int32 *)src)
#define vld3q_dup_u8(src) neon_ld3r_q8((__int8 *)src)
#define vld3q_dup_s64(src) neon_ld3r_q64((__int64 *)src)
#define vld3q_dup_u64(src) neon_ld3r_q64((__int64 *)src)

// vld3q_*: forward to the neon_ld3m_q* intrinsics.
#define vld3q_f32(src) neon_ld3m_q32((__int32 *)src)
#define vld3q_p16(src) neon_ld3m_q16((__int16 *)src)
#define vld3q_p8(src) neon_ld3m_q8((__int8 *)src)
#define vld3q_s16(src) neon_ld3m_q16((__int16 *)src)
#define vld3q_s32(src) neon_ld3m_q32((__int32 *)src)
#define vld3q_s8(src) neon_ld3m_q8((__int8 *)src)
#define vld3q_u16(src) neon_ld3m_q16((__int16 *)src)

// Start of a macro whose replacement text is on the next line.
#define vld3q_u32(src)
// (continued) Replacement text of the macro whose #define starts on the previous line.
neon_ld3m_q32((__int32*)src)

// vld3q_* (remaining forms): forward to the neon_ld3m_q* intrinsics.
#define vld3q_u8(src) neon_ld3m_q8((__int8 *)src)
#define vld3q_s64(src) neon_ld3m_q64((__int64 *)src)
#define vld3q_u64(src) neon_ld3m_q64((__int64 *)src)

// vld3_lane_*(src1, src2, src3): src1 is the pointer (cast to the intrinsic's
// element type), src2 the source aggregate, src3 the lane index; all three are
// forwarded to neon_ld3s_*.
#define vld3_lane_f32(src1, src2, src3) neon_ld3s_32((__int32 *)src1, src2, src3)
#define vld3_lane_p16(src1, src2, src3) neon_ld3s_16((__int16 *)src1, src2, src3)
#define vld3_lane_p8(src1, src2, src3) neon_ld3s_8((__int8 *)src1, src2, src3)
#define vld3_lane_s16(src1, src2, src3) neon_ld3s_16((__int16 *)src1, src2, src3)
#define vld3_lane_s32(src1, src2, src3) neon_ld3s_32((__int32 *)src1, src2, src3)
#define vld3_lane_s64(src1, src2, src3) neon_ld3s_64((__int64 *)src1, src2, src3)
#define vld3_lane_s8(src1, src2, src3) neon_ld3s_8((__int8 *)src1, src2, src3)
#define vld3_lane_u16(src1, src2, src3) neon_ld3s_16((__int16 *)src1, src2, src3)
#define vld3_lane_u32(src1, src2, src3) neon_ld3s_32((__int32 *)src1, src2, src3)
#define vld3_lane_u8(src1, src2, src3) neon_ld3s_8((__int8 *)src1, src2, src3)

// vld3q_lane_*(src1, src2, src3): same argument roles, forwarded to neon_ld3s_q*.
#define vld3q_lane_f32(src1, src2, src3) neon_ld3s_q32((__int32 *)src1, src2, src3)
#define vld3q_lane_p8(src1, src2, src3) neon_ld3s_q8((__int8 *)src1, src2, src3)
#define vld3q_lane_p16(src1, src2, src3) neon_ld3s_q16((__int16 *)src1, src2, src3)
#define vld3q_lane_s16(src1, src2, src3) neon_ld3s_q16((__int16 *)src1, src2, src3)
#define vld3q_lane_s32(src1, src2, src3) neon_ld3s_q32((__int32 *)src1, src2, src3)
#define vld3q_lane_s64(src1, src2, src3) neon_ld3s_q64((__int64 *)src1, src2, src3)
#define vld3q_lane_u16(src1, src2, src3) neon_ld3s_q16((__int16 *)src1, src2, src3)
// Fixed: the expansion previously dropped src1 and cast src2 (the vector
// aggregate) to a pointer, passing only two arguments to the three-parameter
// neon_ld3s_q32. It now forwards (pointer, aggregate, lane) like vld3q_lane_s32.
#define vld3q_lane_u32(src1, src2, src3) neon_ld3s_q32((__int32 *)src1, src2, src3)

// vld2_dup_*: cast the pointer argument and forward it to the matching neon_ld2r_* intrinsic.
#define vld2_dup_f32(src) neon_ld2r_32((__int32 *)src)
#define vld2_dup_p16(src) neon_ld2r_16((__int16 *)src)
#define vld2_dup_p8(src) neon_ld2r_8((__int8 *)src)
#define vld2_dup_s16(src) neon_ld2r_16((__int16 *)src)
#define vld2_dup_s32(src) neon_ld2r_32((__int32 *)src)
#define vld2_dup_s8(src) neon_ld2r_8((__int8 *)src)

// Start of a macro whose replacement text is on the next line.
#define vld2_dup_u16(src)
// (continued) Replacement text of the macro whose #define starts on the previous line.
neon_ld2r_16((__int16*)src)

// vld2_dup_* (remaining forms): forward to the neon_ld2r_* intrinsics.
#define vld2_dup_u32(src) neon_ld2r_32((__int32 *)src)
#define vld2_dup_u8(src) neon_ld2r_8((__int8 *)src)
#define vld2_dup_s64(src) neon_ld2r_64((__int64 *)src)
#define vld2_dup_u64(src) neon_ld2r_64((__int64 *)src)

// vld2_*: forward to neon_ld2m_*; the 64-bit element forms map to neon_ld1m2_64 instead.
#define vld2_f32(src) neon_ld2m_32((__int32 *)src)
#define vld2_p16(src) neon_ld2m_16((__int16 *)src)
#define vld2_p8(src) neon_ld2m_8((__int8 *)src)
#define vld2_s16(src) neon_ld2m_16((__int16 *)src)
#define vld2_s32(src) neon_ld2m_32((__int32 *)src)
#define vld2_s8(src) neon_ld2m_8((__int8 *)src)
#define vld2_u16(src) neon_ld2m_16((__int16 *)src)
#define vld2_u32(src) neon_ld2m_32((__int32 *)src)
#define vld2_u8(src) neon_ld2m_8((__int8 *)src)
#define vld2_s64(src) neon_ld1m2_64((__int64 *)src)
#define vld2_u64(src) neon_ld1m2_64((__int64 *)src)

// vld2q_dup_*: forward to the neon_ld2r_q* intrinsics.
#define vld2q_dup_f32(src) neon_ld2r_q32((__int32 *)src)
#define vld2q_dup_p16(src) neon_ld2r_q16((__int16 *)src)
#define vld2q_dup_p8(src) neon_ld2r_q8((__int8 *)src)
#define vld2q_dup_s16(src) neon_ld2r_q16((__int16 *)src)
#define vld2q_dup_s32(src) neon_ld2r_q32((__int32 *)src)
#define vld2q_dup_s8(src) neon_ld2r_q8((__int8 *)src)
#define vld2q_dup_u16(src) neon_ld2r_q16((__int16 *)src)
#define vld2q_dup_u32(src) neon_ld2r_q32((__int32 *)src)
#define vld2q_dup_u8(src) neon_ld2r_q8((__int8 *)src)
#define vld2q_dup_s64(src) neon_ld2r_q64((__int64 *)src)
#define vld2q_dup_u64(src) neon_ld2r_q64((__int64 *)src)

// vld2q_*: forward to the neon_ld2m_q* intrinsics.
#define vld2q_f32(src) neon_ld2m_q32((__int32 *)src)
#define vld2q_p16(src) neon_ld2m_q16((__int16 *)src)
#define vld2q_p8(src) neon_ld2m_q8((__int8 *)src)
#define vld2q_s16(src) neon_ld2m_q16((__int16 *)src)
#define vld2q_s32(src) neon_ld2m_q32((__int32 *)src)
#define vld2q_s8(src) neon_ld2m_q8((__int8 *)src)
#define vld2q_u16(src) neon_ld2m_q16((__int16 *)src)
#define vld2q_u32(src) neon_ld2m_q32((__int32 *)src)
#define vld2q_u8(src) neon_ld2m_q8((__int8 *)src)
#define vld2q_s64(src) neon_ld2m_q64((__int64 *)src)
#define vld2q_u64(src) neon_ld2m_q64((__int64 *)src)

// Start of a macro whose replacement text is on the next line.
#define vld2_lane_f32(src1, src2, src3)
// (continued) Replacement text of the macro whose #define starts on the previous line.
neon_ld2s_32((__int32*)src1, src2, src3)

// vld2_lane_*(src1, src2, src3): src1 is the pointer (cast to the intrinsic's
// element type), src2 the source aggregate, src3 the lane index; all three are
// forwarded to neon_ld2s_*.
#define vld2_lane_p16(src1, src2, src3) neon_ld2s_16((__int16 *)src1, src2, src3)
#define vld2_lane_p8(src1, src2, src3) neon_ld2s_8((__int8 *)src1, src2, src3)
#define vld2_lane_s16(src1, src2, src3) neon_ld2s_16((__int16 *)src1, src2, src3)
#define vld2_lane_s32(src1, src2, src3) neon_ld2s_32((__int32 *)src1, src2, src3)
#define vld2_lane_s64(src1, src2, src3) neon_ld2s_64((__int64 *)src1, src2, src3)
#define vld2_lane_s8(src1, src2, src3) neon_ld2s_8((__int8 *)src1, src2, src3)
#define vld2_lane_u16(src1, src2, src3) neon_ld2s_16((__int16 *)src1, src2, src3)
#define vld2_lane_u32(src1, src2, src3) neon_ld2s_32((__int32 *)src1, src2, src3)
#define vld2_lane_u8(src1, src2, src3) neon_ld2s_8((__int8 *)src1, src2, src3)

// vld2q_lane_*(src1, src2, src3): same argument roles, forwarded to neon_ld2s_q*.
#define vld2q_lane_f32(src1, src2, src3) neon_ld2s_q32((__int32 *)src1, src2, src3)
#define vld2q_lane_p8(src1, src2, src3) neon_ld2s_q8((__int8 *)src1, src2, src3)
#define vld2q_lane_p16(src1, src2, src3) neon_ld2s_q16((__int16 *)src1, src2, src3)
#define vld2q_lane_s16(src1, src2, src3) neon_ld2s_q16((__int16 *)src1, src2, src3)
#define vld2q_lane_s32(src1, src2, src3) neon_ld2s_q32((__int32 *)src1, src2, src3)
#define vld2q_lane_s64(src1, src2, src3) neon_ld2s_q64((__int64 *)src1, src2, src3)
#define vld2q_lane_u16(src1, src2, src3) neon_ld2s_q16((__int16 *)src1, src2, src3)
// Fixed: the expansion previously dropped src1 and cast src2 (the vector
// aggregate) to a pointer, passing only two arguments to the three-parameter
// neon_ld2s_q32. It now forwards (pointer, aggregate, lane) like vld2q_lane_s32.
#define vld2q_lane_u32(src1, src2, src3) neon_ld2s_q32((__int32 *)src1, src2, src3)

// vld1_dup_*: cast the pointer argument and forward it to the matching neon_ld1r_* intrinsic.
#define vld1_dup_f32(src) neon_ld1r_32((__int32 *)src)
#define vld1_dup_p16(src) neon_ld1r_16((__int16 *)src)
#define vld1_dup_p8(src) neon_ld1r_8((__int8 *)src)
#define vld1_dup_s16(src) neon_ld1r_16((__int16 *)src)
#define vld1_dup_s32(src) neon_ld1r_32((__int32 *)src)
#define vld1_dup_s8(src) neon_ld1r_8((__int8 *)src)
#define vld1_dup_u16(src) neon_ld1r_16((__int16 *)src)
#define vld1_dup_u32(src) neon_ld1r_32((__int32 *)src)
#define vld1_dup_u8(src) neon_ld1r_8((__int8 *)src)
#define vld1_dup_s64(src) neon_ld1r_64((__int64 *)src)

// Start of a macro whose replacement text is on the next line.
#define vld1_dup_u64(src)
// (continued) Replacement text of the macro whose #define starts on the previous line.
neon_ld1r_64((__int64*)src)

// vld1_*: cast the pointer argument and forward it to the matching neon_ld1m_* intrinsic.
#define vld1_f32(src) neon_ld1m_32((__int32 *)src)
#define vld1_p16(src) neon_ld1m_16((__int16 *)src)
#define vld1_p8(src) neon_ld1m_8((__int8 *)src)
#define vld1_s16(src) neon_ld1m_16((__int16 *)src)
#define vld1_s32(src) neon_ld1m_32((__int32 *)src)
#define vld1_s8(src) neon_ld1m_8((__int8 *)src)
#define vld1_u16(src) neon_ld1m_16((__int16 *)src)
#define vld1_u32(src) neon_ld1m_32((__int32 *)src)
#define vld1_u8(src) neon_ld1m_8((__int8 *)src)
#define vld1_s64(src) neon_ld1m_64((__int64 *)src)
#define vld1_u64(src) neon_ld1m_64((__int64 *)src)
#define vld1_f64(src) neon_ld1m_64((__int64 *)src)

// vld1_*_x2: forward to the neon_ld1m2_* intrinsics.
#define vld1_f32_x2(src) neon_ld1m2_32((__int32 *)src)
#define vld1_p16_x2(src) neon_ld1m2_16((__int16 *)src)
#define vld1_p8_x2(src) neon_ld1m2_8((__int8 *)src)
#define vld1_s16_x2(src) neon_ld1m2_16((__int16 *)src)
#define vld1_s32_x2(src) neon_ld1m2_32((__int32 *)src)
#define vld1_s8_x2(src) neon_ld1m2_8((__int8 *)src)
#define vld1_u16_x2(src) neon_ld1m2_16((__int16 *)src)
#define vld1_u32_x2(src) neon_ld1m2_32((__int32 *)src)
#define vld1_u8_x2(src) neon_ld1m2_8((__int8 *)src)
#define vld1_s64_x2(src) neon_ld1m2_64((__int64 *)src)
#define vld1_u64_x2(src) neon_ld1m2_64((__int64 *)src)
#define vld1_f64_x2(src) neon_ld1m2_64((__int64 *)src)

// vld1_*_x3: forward to the neon_ld1m3_* intrinsics.
#define vld1_f32_x3(src) neon_ld1m3_32((__int32 *)src)
#define vld1_p16_x3(src) neon_ld1m3_16((__int16 *)src)
#define vld1_p8_x3(src) neon_ld1m3_8((__int8 *)src)
#define vld1_s16_x3(src) neon_ld1m3_16((__int16 *)src)
#define vld1_s32_x3(src) neon_ld1m3_32((__int32 *)src)
#define vld1_s8_x3(src) neon_ld1m3_8((__int8 *)src)
#define vld1_u16_x3(src) neon_ld1m3_16((__int16 *)src)
#define vld1_u32_x3(src) neon_ld1m3_32((__int32 *)src)
#define vld1_u8_x3(src) neon_ld1m3_8((__int8 *)src)
#define vld1_s64_x3(src) neon_ld1m3_64((__int64 *)src)
#define vld1_u64_x3(src) neon_ld1m3_64((__int64 *)src)
#define vld1_f64_x3(src) neon_ld1m3_64((__int64 *)src)

// vld1_*_x4: forward to the neon_ld1m4_* intrinsics.
#define vld1_f32_x4(src) neon_ld1m4_32((__int32 *)src)

// Start of a macro whose replacement text is on the next line.
#define vld1_p16_x4(src)
// (continued) Replacement text of the macro whose #define starts on the previous line.
neon_ld1m4_16((__int16*)src)

// vld1_*_x4 (remaining forms): forward to the neon_ld1m4_* intrinsics.
#define vld1_p8_x4(src) neon_ld1m4_8((__int8 *)src)
#define vld1_s16_x4(src) neon_ld1m4_16((__int16 *)src)
#define vld1_s32_x4(src) neon_ld1m4_32((__int32 *)src)
#define vld1_s8_x4(src) neon_ld1m4_8((__int8 *)src)
#define vld1_u16_x4(src) neon_ld1m4_16((__int16 *)src)
#define vld1_u32_x4(src) neon_ld1m4_32((__int32 *)src)
#define vld1_u8_x4(src) neon_ld1m4_8((__int8 *)src)
#define vld1_s64_x4(src) neon_ld1m4_64((__int64 *)src)
#define vld1_u64_x4(src) neon_ld1m4_64((__int64 *)src)
#define vld1_f64_x4(src) neon_ld1m4_64((__int64 *)src)

// vld1q_dup_*: forward to the neon_ld1r_q* intrinsics.
#define vld1q_dup_f32(src) neon_ld1r_q32((__int32 *)src)
#define vld1q_dup_p16(src) neon_ld1r_q16((__int16 *)src)
#define vld1q_dup_p8(src) neon_ld1r_q8((__int8 *)src)
#define vld1q_dup_s16(src) neon_ld1r_q16((__int16 *)src)
#define vld1q_dup_s32(src) neon_ld1r_q32((__int32 *)src)
#define vld1q_dup_s8(src) neon_ld1r_q8((__int8 *)src)
#define vld1q_dup_u16(src) neon_ld1r_q16((__int16 *)src)
#define vld1q_dup_u32(src) neon_ld1r_q32((__int32 *)src)
#define vld1q_dup_u8(src) neon_ld1r_q8((__int8 *)src)
#define vld1q_dup_s64(src) neon_ld1r_q64((__int64 *)src)
#define vld1q_dup_u64(src) neon_ld1r_q64((__int64 *)src)

// vld1q_*: forward to the neon_ld1m_q* intrinsics.
#define vld1q_f32(src) neon_ld1m_q32((__int32 *)src)
#define vld1q_p16(src) neon_ld1m_q16((__int16 *)src)
#define vld1q_p8(src) neon_ld1m_q8((__int8 *)src)
#define vld1q_s16(src) neon_ld1m_q16((__int16 *)src)
#define vld1q_s32(src) neon_ld1m_q32((__int32 *)src)
#define vld1q_s8(src) neon_ld1m_q8((__int8 *)src)
#define vld1q_u16(src) neon_ld1m_q16((__int16 *)src)
#define vld1q_u32(src) neon_ld1m_q32((__int32 *)src)
#define vld1q_u8(src) neon_ld1m_q8((__int8 *)src)
#define vld1q_s64(src) neon_ld1m_q64((__int64 *)src)
#define vld1q_u64(src) neon_ld1m_q64((__int64 *)src)

// vld1q_*_x2: forward to the neon_ld1m2_q* intrinsics.
#define vld1q_f32_x2(src) neon_ld1m2_q32((__int32 *)src)
#define vld1q_p16_x2(src) neon_ld1m2_q16((__int16 *)src)
#define vld1q_p8_x2(src) neon_ld1m2_q8((__int8 *)src)
#define vld1q_s16_x2(src) neon_ld1m2_q16((__int16 *)src)

// Start of a macro whose replacement text is on the next line.
#define vld1q_s32_x2(src)
neon_ld1m2_q32((__int32*)src) #define vld1q_s8_x2(src) neon_ld1m2_q8((__int8*)src) #define vld1q_u16_x2(src) neon_ld1m2_q16((__int16*)src) #define vld1q_u32_x2(src) neon_ld1m2_q32((__int32*)src) #define vld1q_u8_x2(src) neon_ld1m2_q8((__int8*)src) #define vld1q_s64_x2(src) neon_ld1m2_q64((__int64*)src) #define vld1q_u64_x2(src) neon_ld1m2_q64((__int64*)src) #define vld1q_f32_x3(src) neon_ld1m3_q32((__int32*)src) #define vld1q_p16_x3(src) neon_ld1m3_q16((__int16*)src) #define vld1q_p8_x3(src) neon_ld1m3_q8((__int8*)src) #define vld1q_s16_x3(src) neon_ld1m3_q16((__int16*)src) #define vld1q_s32_x3(src) neon_ld1m3_q32((__int32*)src) #define vld1q_s8_x3(src) neon_ld1m3_q8((__int8*)src) #define vld1q_u16_x3(src) neon_ld1m3_q16((__int16*)src) #define vld1q_u32_x3(src) neon_ld1m3_q32((__int32*)src) #define vld1q_u8_x3(src) neon_ld1m3_q8((__int8*)src) #define vld1q_s64_x3(src) neon_ld1m3_q64((__int64*)src) #define vld1q_u64_x3(src) neon_ld1m3_q64((__int64*)src) #define vld1q_f32_x4(src) neon_ld1m4_q32((__int32*)src) #define vld1q_p16_x4(src) neon_ld1m4_q16((__int16*)src) #define vld1q_p8_x4(src) neon_ld1m4_q8((__int8*)src) #define vld1q_s16_x4(src) neon_ld1m4_q16((__int16*)src) #define vld1q_s32_x4(src) neon_ld1m4_q32((__int32*)src) #define vld1q_s8_x4(src) neon_ld1m4_q8((__int8*)src) #define vld1q_u16_x4(src) neon_ld1m4_q16((__int16*)src) #define vld1q_u32_x4(src) neon_ld1m4_q32((__int32*)src) #define vld1q_u8_x4(src) neon_ld1m4_q8((__int8*)src) #define vld1q_s64_x4(src) neon_ld1m4_q64((__int64*)src) #define vld1q_u64_x4(src) neon_ld1m4_q64((__int64*)src) #define vld1_lane_f32(src1, src2, src3) neon_ld1s_32((__int32*)src1, src2, src3) #define vld1_lane_p16(src1, src2, src3) neon_ld1s_16((__int16*)src1, src2, src3) #define vld1_lane_p8(src1, src2, src3) neon_ld1s_8((__int8*)src1, src2, src3) #define vld1_lane_s16(src1, src2, src3) neon_ld1s_16((__int16*)src1, src2, src3) #define vld1_lane_s32(src1, src2, src3) neon_ld1s_32((__int32*)src1, src2, src3) #define 
vld1_lane_s64(src1, src2, src3) neon_ld1s_64((__int64*)src1, src2, src3) #define vld1_lane_s8(src1, src2, src3) neon_ld1s_8((__int8*)src1, src2, src3) #define vld1_lane_u16(src1, src2, src3) neon_ld1s_16((__int16*)src1, src2, src3) #define vld1_lane_u32(src1, src2, src3) neon_ld1s_32((__int32*)src1, src2, src3) #define vld1_lane_u8(src1, src2, src3) neon_ld1s_8((__int8*)src1, src2, src3) #define vld1q_lane_f32(src1, src2, src3) neon_ld1s_q32((__int32*)src1, src2, src3) #define vld1q_lane_p8(src1, src2, src3) neon_ld1s_q8((__int8*)src1, src2, src3) #define vld1q_lane_p16(src1, src2, src3) neon_ld1s_q16((__int16*)src1, src2, src3) #define vld1q_lane_s16(src1, src2, src3) neon_ld1s_q16((__int16*)src1, src2, src3) #define vld1q_lane_s32(src1, src2, src3) neon_ld1s_q32((__int32*)src1, src2, src3) #define vld1q_lane_s64(src1, src2, src3) neon_ld1s_q64((__int64*)src1, src2, src3) #define vld1q_lane_u16(src1, src2, src3) neon_ld1s_q16((__int16*)src1, src2, src3) #define vld1q_lane_u32(src1, src2, src3) neon_ld1s_q32((__int32*)src1, src2, src3) #define vld4_dup_f32_ex(src, align) neon_ld4r_32((__int32*)src) #define vld4_dup_p16_ex(src, align) neon_ld4r_16((__int16*)src) #define vld4_dup_p8_ex(src, align) neon_ld4r_8((__int8*)src) #define vld4_dup_s16_ex(src, align) neon_ld4r_16((__int16*)src) #define vld4_dup_s32_ex(src, align) neon_ld4r_32((__int32*)src) #define vld4_dup_s8_ex(src, align) neon_ld4r_8((__int8*)src) #define vld4_dup_u16_ex(src, align) neon_ld4r_16((__int16*)src) #define vld4_dup_u32_ex(src, align) neon_ld4r_32((__int32*)src) #define vld4_dup_u8_ex(src, align) neon_ld4r_8((__int8*)src) #define vld4_dup_s64_ex(src, align) neon_ld4r_64((__int64*)src) #define vld4_dup_u64_ex(src, align) neon_ld4r_64((__int64*)src) #define vld4_f32_ex(src, align) neon_ld4m_32((__int32*)src) #define vld4_p16_ex(src, align) neon_ld4m_16((__int16*)src) #define vld4_p8_ex(src, align) neon_ld4m_8((__int8*)src) #define vld4_s16_ex(src, align) neon_ld4m_16((__int16*)src) #define 
vld4_s32_ex(src, align) neon_ld4m_32((__int32*)src) #define vld4_s8_ex(src, align) neon_ld4m_8((__int8*)src) #define vld4_u16_ex(src, align) neon_ld4m_16((__int16*)src) #define vld4_u32_ex(src, align) neon_ld4m_32((__int32*)src) #define vld4_u8_ex(src, align) neon_ld4m_8((__int8*)src) #define vld4_s64_ex(src, align) neon_ld1m4_64((__int64*)src) #define vld4_u64_ex(src, align) neon_ld1m4_64((__int64*)src) #define vld4q_dup_f32_ex(src, align) neon_ld4r_q32((__int32*)src) #define vld4q_dup_p16_ex(src, align) neon_ld4r_q16((__int16*)src) #define vld4q_dup_p8_ex(src, align) neon_ld4r_q8((__int8*)src) #define vld4q_dup_s16_ex(src, align) neon_ld4r_q16((__int16*)src) #define vld4q_dup_s32_ex(src, align) neon_ld4r_q32((__int32*)src) #define vld4q_dup_s8_ex(src, align) neon_ld4r_q8((__int8*)src) #define vld4q_dup_u16_ex(src, align) neon_ld4r_q16((__int16*)src) #define vld4q_dup_u32_ex(src, align) neon_ld4r_q32((__int32*)src) #define vld4q_dup_u8_ex(src, align) neon_ld4r_q8((__int8*)src) #define vld4q_dup_s64_ex(src, align) neon_ld4r_q64((__int64*)src) #define vld4q_dup_u64_ex(src, align) neon_ld4r_q64((__int64*)src) #define vld4q_f32_ex(src, align) neon_ld4m_q32((__int32*)src) #define vld4q_p16_ex(src, align) neon_ld4m_q16((__int16*)src) #define vld4q_p8_ex(src, align) neon_ld4m_q8((__int8*)src) #define vld4q_s16_ex(src, align) neon_ld4m_q16((__int16*)src) #define vld4q_s32_ex(src, align) neon_ld4m_q32((__int32*)src) #define vld4q_s8_ex(src, align) neon_ld4m_q8((__int8*)src) #define vld4q_u16_ex(src, align) neon_ld4m_q16((__int16*)src) #define vld4q_u32_ex(src, align) neon_ld4m_q32((__int32*)src) #define vld4q_u8_ex(src, align) neon_ld4m_q8((__int8*)src) #define vld4q_s64_ex(src, align) neon_ld4m_q64((__int64*)src) #define vld4q_u64_ex(src, align) neon_ld4m_q64((__int64*)src) #define vld4_lane_f32_ex(src1, src2, src3) neon_ld4s_32((__int32*)src1, src2, src3) #define vld4_lane_p16_ex(src1, src2, src3) neon_ld4s_16((__int16*)src1, src2, src3) #define vld4_lane_p8_ex(src1, 
// (continued) Rest of the parameter list and replacement text of the macro begun on the previous line.
src2, src3) neon_ld4s_8((__int8*)src1, src2, src3)

// vld4_lane_*_ex (remaining forms): forward (pointer, aggregate, lane) to neon_ld4s_*.
#define vld4_lane_s16_ex(src1, src2, src3) neon_ld4s_16((__int16 *)src1, src2, src3)
#define vld4_lane_s32_ex(src1, src2, src3) neon_ld4s_32((__int32 *)src1, src2, src3)
#define vld4_lane_s64_ex(src1, src2, src3) neon_ld4s_64((__int64 *)src1, src2, src3)
#define vld4_lane_s8_ex(src1, src2, src3) neon_ld4s_8((__int8 *)src1, src2, src3)
#define vld4_lane_u16_ex(src1, src2, src3) neon_ld4s_16((__int16 *)src1, src2, src3)
#define vld4_lane_u32_ex(src1, src2, src3) neon_ld4s_32((__int32 *)src1, src2, src3)
#define vld4_lane_u8_ex(src1, src2, src3) neon_ld4s_8((__int8 *)src1, src2, src3)

// vld4q_lane_*_ex: forward (pointer, aggregate, lane) to neon_ld4s_q*.
#define vld4q_lane_f32_ex(src1, src2, src3) neon_ld4s_q32((__int32 *)src1, src2, src3)
#define vld4q_lane_p8_ex(src1, src2, src3) neon_ld4s_q8((__int8 *)src1, src2, src3)
#define vld4q_lane_p16_ex(src1, src2, src3) neon_ld4s_q16((__int16 *)src1, src2, src3)
#define vld4q_lane_s16_ex(src1, src2, src3) neon_ld4s_q16((__int16 *)src1, src2, src3)
#define vld4q_lane_s32_ex(src1, src2, src3) neon_ld4s_q32((__int32 *)src1, src2, src3)
#define vld4q_lane_s64_ex(src1, src2, src3) neon_ld4s_q64((__int64 *)src1, src2, src3)
#define vld4q_lane_u16_ex(src1, src2, src3) neon_ld4s_q16((__int16 *)src1, src2, src3)
#define vld4q_lane_u32_ex(src1, src2, src3) neon_ld4s_q32((__int32 *)src1, src2, src3)

// vld3_dup_*_ex(src, align): forward to neon_ld3r_*; `align` is accepted but
// does not appear in the replacement text.
#define vld3_dup_f32_ex(src, align) neon_ld3r_32((__int32 *)src)
#define vld3_dup_p16_ex(src, align) neon_ld3r_16((__int16 *)src)
#define vld3_dup_p8_ex(src, align) neon_ld3r_8((__int8 *)src)
#define vld3_dup_s16_ex(src, align) neon_ld3r_16((__int16 *)src)
#define vld3_dup_s32_ex(src, align) neon_ld3r_32((__int32 *)src)
#define vld3_dup_s8_ex(src, align) neon_ld3r_8((__int8 *)src)
#define vld3_dup_u16_ex(src, align) neon_ld3r_16((__int16 *)src)
#define vld3_dup_u32_ex(src, align) neon_ld3r_32((__int32 *)src)
#define vld3_dup_u8_ex(src, align) neon_ld3r_8((__int8 *)src)
#define vld3_dup_s64_ex(src, align) neon_ld3r_64((__int64 *)src)

// Start of a macro whose replacement text is on the next line.
#define vld3_dup_u64_ex(src, align)
// (continued) Replacement text of the macro whose #define starts on the previous line.
neon_ld3r_64((__int64*)src)

// vld3_*_ex(src, align): forward to neon_ld3m_*; `align` is not used in the
// expansion. The 64-bit element forms map to neon_ld1m3_64.
#define vld3_f32_ex(src, align) neon_ld3m_32((__int32 *)src)
#define vld3_p16_ex(src, align) neon_ld3m_16((__int16 *)src)
#define vld3_p8_ex(src, align) neon_ld3m_8((__int8 *)src)
#define vld3_s16_ex(src, align) neon_ld3m_16((__int16 *)src)
#define vld3_s32_ex(src, align) neon_ld3m_32((__int32 *)src)
#define vld3_s8_ex(src, align) neon_ld3m_8((__int8 *)src)
#define vld3_u16_ex(src, align) neon_ld3m_16((__int16 *)src)
#define vld3_u32_ex(src, align) neon_ld3m_32((__int32 *)src)
#define vld3_u8_ex(src, align) neon_ld3m_8((__int8 *)src)
#define vld3_s64_ex(src, align) neon_ld1m3_64((__int64 *)src)
#define vld3_u64_ex(src, align) neon_ld1m3_64((__int64 *)src)

// vld3q_dup_*_ex: forward to the neon_ld3r_q* intrinsics; `align` is not used.
#define vld3q_dup_f32_ex(src, align) neon_ld3r_q32((__int32 *)src)
#define vld3q_dup_p16_ex(src, align) neon_ld3r_q16((__int16 *)src)
#define vld3q_dup_p8_ex(src, align) neon_ld3r_q8((__int8 *)src)
#define vld3q_dup_s16_ex(src, align) neon_ld3r_q16((__int16 *)src)
#define vld3q_dup_s32_ex(src, align) neon_ld3r_q32((__int32 *)src)
#define vld3q_dup_s8_ex(src, align) neon_ld3r_q8((__int8 *)src)
#define vld3q_dup_u16_ex(src, align) neon_ld3r_q16((__int16 *)src)
#define vld3q_dup_u32_ex(src, align) neon_ld3r_q32((__int32 *)src)
#define vld3q_dup_u8_ex(src, align) neon_ld3r_q8((__int8 *)src)
#define vld3q_dup_s64_ex(src, align) neon_ld3r_q64((__int64 *)src)
#define vld3q_dup_u64_ex(src, align) neon_ld3r_q64((__int64 *)src)

// vld3q_*_ex: forward to the neon_ld3m_q* intrinsics; `align` is not used.
#define vld3q_f32_ex(src, align) neon_ld3m_q32((__int32 *)src)
#define vld3q_p16_ex(src, align) neon_ld3m_q16((__int16 *)src)
#define vld3q_p8_ex(src, align) neon_ld3m_q8((__int8 *)src)
#define vld3q_s16_ex(src, align) neon_ld3m_q16((__int16 *)src)
#define vld3q_s32_ex(src, align) neon_ld3m_q32((__int32 *)src)
#define vld3q_s8_ex(src, align) neon_ld3m_q8((__int8 *)src)
#define vld3q_u16_ex(src, align) neon_ld3m_q16((__int16 *)src)
#define vld3q_u32_ex(src, align) neon_ld3m_q32((__int32 *)src)
#define vld3q_u8_ex(src, align) neon_ld3m_q8((__int8 *)src)

// Start of a macro whose replacement text is on the next line.
#define vld3q_s64_ex(src, align)
neon_ld3m_q64((__int64*)src) #define vld3q_u64_ex(src, align) neon_ld3m_q64((__int64*)src) #define vld3_lane_f32_ex(src1, src2, src3) neon_ld3s_32((__int32*)src1, src2, src3) #define vld3_lane_p16_ex(src1, src2, src3) neon_ld3s_16((__int16*)src1, src2, src3) #define vld3_lane_p8_ex(src1, src2, src3) neon_ld3s_8((__int8*)src1, src2, src3) #define vld3_lane_s16_ex(src1, src2, src3) neon_ld3s_16((__int16*)src1, src2, src3) #define vld3_lane_s32_ex(src1, src2, src3) neon_ld3s_32((__int32*)src1, src2, src3) #define vld3_lane_s64_ex(src1, src2, src3) neon_ld3s_64((__int64*)src1, src2, src3) #define vld3_lane_s8_ex(src1, src2, src3) neon_ld3s_8((__int8*)src1, src2, src3) #define vld3_lane_u16_ex(src1, src2, src3) neon_ld3s_16((__int16*)src1, src2, src3) #define vld3_lane_u32_ex(src1, src2, src3) neon_ld3s_32((__int32*)src1, src2, src3) #define vld3_lane_u8_ex(src1, src2, src3) neon_ld3s_8((__int8*)src1, src2, src3) #define vld3q_lane_f32_ex(src1, src2, src3) neon_ld3s_q32((__int32*)src1, src2, src3) #define vld3q_lane_p8_ex(src1, src2, src3) neon_ld3s_q8((__int8*)src1, src2, src3) #define vld3q_lane_p16_ex(src1, src2, src3) neon_ld3s_q16((__int16*)src1, src2, src3) #define vld3q_lane_s16_ex(src1, src2, src3) neon_ld3s_q16((__int16*)src1, src2, src3) #define vld3q_lane_s32_ex(src1, src2, src3) neon_ld3s_q32((__int32*)src1, src2, src3) #define vld3q_lane_s64_ex(src1, src2, src3) neon_ld3s_q64((__int64*)src1, src2, src3) #define vld3q_lane_u16_ex(src1, src2, src3) neon_ld3s_q16((__int16*)src1, src2, src3) #define vld3q_lane_u32_ex(src1, src2, src3) neon_ld3s_q32((__int32*)src1, src2, src3) #define vld2_dup_f32_ex(src, align) neon_ld2r_32((__int32*)src) #define vld2_dup_p16_ex(src, align) neon_ld2r_16((__int16*)src) #define vld2_dup_p8_ex(src, align) neon_ld2r_8((__int8*)src) #define vld2_dup_s16_ex(src, align) neon_ld2r_16((__int16*)src) #define vld2_dup_s32_ex(src, align) neon_ld2r_32((__int32*)src) #define vld2_dup_s8_ex(src, align) neon_ld2r_8((__int8*)src) #define 
vld2_dup_u16_ex(src, align) neon_ld2r_16((__int16*)src) #define vld2_dup_u32_ex(src, align) neon_ld2r_32((__int32*)src) #define vld2_dup_u8_ex(src, align) neon_ld2r_8((__int8*)src) #define vld2_dup_s64_ex(src, align) neon_ld2r_64((__int64*)src) #define vld2_dup_u64_ex(src, align) neon_ld2r_64((__int64*)src) #define vld2_f32_ex(src, align) neon_ld2m_32((__int32*)src) #define vld2_p16_ex(src, align) neon_ld2m_16((__int16*)src) #define vld2_p8_ex(src, align) neon_ld2m_8((__int8*)src) #define vld2_s16_ex(src, align) neon_ld2m_16((__int16*)src) #define vld2_s32_ex(src, align) neon_ld2m_32((__int32*)src) #define vld2_s8_ex(src, align) neon_ld2m_8((__int8*)src) #define vld2_u16_ex(src, align) neon_ld2m_16((__int16*)src) #define vld2_u32_ex(src, align) neon_ld2m_32((__int32*)src) #define vld2_u8_ex(src, align) neon_ld2m_8((__int8*)src) #define vld2_s64_ex(src, align) neon_ld1m2_64((__int64*)src) #define vld2_u64_ex(src, align) neon_ld1m2_64((__int64*)src) #define vld2q_dup_f32_ex(src, align) neon_ld2r_q32((__int32*)src) #define vld2q_dup_p16_ex(src, align) neon_ld2r_q16((__int16*)src) #define vld2q_dup_p8_ex(src, align) neon_ld2r_q8((__int8*)src) #define vld2q_dup_s16_ex(src, align) neon_ld2r_q16((__int16*)src) #define vld2q_dup_s32_ex(src, align) neon_ld2r_q32((__int32*)src) #define vld2q_dup_s8_ex(src, align) neon_ld2r_q8((__int8*)src) #define vld2q_dup_u16_ex(src, align) neon_ld2r_q16((__int16*)src) #define vld2q_dup_u32_ex(src, align) neon_ld2r_q32((__int32*)src) #define vld2q_dup_u8_ex(src, align) neon_ld2r_q8((__int8*)src) #define vld2q_dup_s64_ex(src, align) neon_ld2r_q64((__int64*)src) #define vld2q_dup_u64_ex(src, align) neon_ld2r_q64((__int64*)src) #define vld2q_f32_ex(src, align) neon_ld2m_q32((__int32*)src) #define vld2q_p16_ex(src, align) neon_ld2m_q16((__int16*)src) #define vld2q_p8_ex(src, align) neon_ld2m_q8((__int8*)src) #define vld2q_s16_ex(src, align) neon_ld2m_q16((__int16*)src) #define vld2q_s32_ex(src, align) neon_ld2m_q32((__int32*)src) #define 
// (continued) Name and replacement text of the macro whose #define is on the previous line.
vld2q_s8_ex(src, align) neon_ld2m_q8((__int8*)src)

// vld2q_*_ex (remaining forms): `align` is accepted but not used in the expansion.
#define vld2q_u16_ex(src, align) neon_ld2m_q16((__int16 *)src)
#define vld2q_u32_ex(src, align) neon_ld2m_q32((__int32 *)src)
#define vld2q_u8_ex(src, align) neon_ld2m_q8((__int8 *)src)
#define vld2q_s64_ex(src, align) neon_ld2m_q64((__int64 *)src)
#define vld2q_u64_ex(src, align) neon_ld2m_q64((__int64 *)src)

// vld2_lane_*_ex(src1, src2, src3): same three parameters and expansion as the
// non-_ex lane macros (pointer, aggregate, lane forwarded to neon_ld2s_*).
#define vld2_lane_f32_ex(src1, src2, src3) neon_ld2s_32((__int32 *)src1, src2, src3)
#define vld2_lane_p16_ex(src1, src2, src3) neon_ld2s_16((__int16 *)src1, src2, src3)
#define vld2_lane_p8_ex(src1, src2, src3) neon_ld2s_8((__int8 *)src1, src2, src3)
#define vld2_lane_s16_ex(src1, src2, src3) neon_ld2s_16((__int16 *)src1, src2, src3)
#define vld2_lane_s32_ex(src1, src2, src3) neon_ld2s_32((__int32 *)src1, src2, src3)
#define vld2_lane_s64_ex(src1, src2, src3) neon_ld2s_64((__int64 *)src1, src2, src3)
#define vld2_lane_s8_ex(src1, src2, src3) neon_ld2s_8((__int8 *)src1, src2, src3)
#define vld2_lane_u16_ex(src1, src2, src3) neon_ld2s_16((__int16 *)src1, src2, src3)
#define vld2_lane_u32_ex(src1, src2, src3) neon_ld2s_32((__int32 *)src1, src2, src3)
#define vld2_lane_u8_ex(src1, src2, src3) neon_ld2s_8((__int8 *)src1, src2, src3)

// vld2q_lane_*_ex: forward (pointer, aggregate, lane) to neon_ld2s_q*.
#define vld2q_lane_f32_ex(src1, src2, src3) neon_ld2s_q32((__int32 *)src1, src2, src3)
#define vld2q_lane_p8_ex(src1, src2, src3) neon_ld2s_q8((__int8 *)src1, src2, src3)
#define vld2q_lane_p16_ex(src1, src2, src3) neon_ld2s_q16((__int16 *)src1, src2, src3)
#define vld2q_lane_s16_ex(src1, src2, src3) neon_ld2s_q16((__int16 *)src1, src2, src3)
#define vld2q_lane_s32_ex(src1, src2, src3) neon_ld2s_q32((__int32 *)src1, src2, src3)
#define vld2q_lane_s64_ex(src1, src2, src3) neon_ld2s_q64((__int64 *)src1, src2, src3)
#define vld2q_lane_u16_ex(src1, src2, src3) neon_ld2s_q16((__int16 *)src1, src2, src3)
#define vld2q_lane_u32_ex(src1, src2, src3) neon_ld2s_q32((__int32 *)src1, src2, src3)

// vld1_dup_*_ex(src, align): forward to neon_ld1r_*; `align` is not used in the expansion.
#define vld1_dup_f32_ex(src, align) neon_ld1r_32((__int32 *)src)

// Start of a macro whose replacement text is on the next line.
#define vld1_dup_p16_ex(src, align)
// (continued) Replacement text of the macro whose #define starts on the previous line.
neon_ld1r_16((__int16*)src)

// vld1_dup_*_ex (remaining forms): `align` is accepted but not used in the expansion.
#define vld1_dup_p8_ex(src, align) neon_ld1r_8((__int8 *)src)
#define vld1_dup_s16_ex(src, align) neon_ld1r_16((__int16 *)src)
#define vld1_dup_s32_ex(src, align) neon_ld1r_32((__int32 *)src)
#define vld1_dup_s8_ex(src, align) neon_ld1r_8((__int8 *)src)
#define vld1_dup_u16_ex(src, align) neon_ld1r_16((__int16 *)src)
#define vld1_dup_u32_ex(src, align) neon_ld1r_32((__int32 *)src)
#define vld1_dup_u8_ex(src, align) neon_ld1r_8((__int8 *)src)
#define vld1_dup_s64_ex(src, align) neon_ld1r_64((__int64 *)src)
#define vld1_dup_u64_ex(src, align) neon_ld1r_64((__int64 *)src)

// vld1_*_ex: forward to the neon_ld1m_* intrinsics; `align` is not used.
#define vld1_f32_ex(src, align) neon_ld1m_32((__int32 *)src)
#define vld1_p16_ex(src, align) neon_ld1m_16((__int16 *)src)
#define vld1_p8_ex(src, align) neon_ld1m_8((__int8 *)src)
#define vld1_s16_ex(src, align) neon_ld1m_16((__int16 *)src)
#define vld1_s32_ex(src, align) neon_ld1m_32((__int32 *)src)
#define vld1_s8_ex(src, align) neon_ld1m_8((__int8 *)src)
#define vld1_u16_ex(src, align) neon_ld1m_16((__int16 *)src)
#define vld1_u32_ex(src, align) neon_ld1m_32((__int32 *)src)
#define vld1_u8_ex(src, align) neon_ld1m_8((__int8 *)src)
#define vld1_s64_ex(src, align) neon_ld1m_64((__int64 *)src)
#define vld1_u64_ex(src, align) neon_ld1m_64((__int64 *)src)
#define vld1_f64_ex(src, align) neon_ld1m_64((__int64 *)src)

// vld1_*_x2_ex: forward to the neon_ld1m2_* intrinsics; `align` is not used.
#define vld1_f32_x2_ex(src, align) neon_ld1m2_32((__int32 *)src)
#define vld1_p16_x2_ex(src, align) neon_ld1m2_16((__int16 *)src)
#define vld1_p8_x2_ex(src, align) neon_ld1m2_8((__int8 *)src)
#define vld1_s16_x2_ex(src, align) neon_ld1m2_16((__int16 *)src)
#define vld1_s32_x2_ex(src, align) neon_ld1m2_32((__int32 *)src)
#define vld1_s8_x2_ex(src, align) neon_ld1m2_8((__int8 *)src)
#define vld1_u16_x2_ex(src, align) neon_ld1m2_16((__int16 *)src)
#define vld1_u32_x2_ex(src, align) neon_ld1m2_32((__int32 *)src)
#define vld1_u8_x2_ex(src, align) neon_ld1m2_8((__int8 *)src)
#define vld1_s64_x2_ex(src, align) neon_ld1m2_64((__int64 *)src)

// Start of a macro whose replacement text is on the next line.
#define vld1_u64_x2_ex(src, align)
neon_ld1m2_64((__int64*)src) #define vld1_f64_x2_ex(src, align) neon_ld1m2_64((__int64*)src) #define vld1_f32_x3_ex(src, align) neon_ld1m3_32((__int32*)src) #define vld1_p16_x3_ex(src, align) neon_ld1m3_16((__int16*)src) #define vld1_p8_x3_ex(src, align) neon_ld1m3_8((__int8*)src) #define vld1_s16_x3_ex(src, align) neon_ld1m3_16((__int16*)src) #define vld1_s32_x3_ex(src, align) neon_ld1m3_32((__int32*)src) #define vld1_s8_x3_ex(src, align) neon_ld1m3_8((__int8*)src) #define vld1_u16_x3_ex(src, align) neon_ld1m3_16((__int16*)src) #define vld1_u32_x3_ex(src, align) neon_ld1m3_32((__int32*)src) #define vld1_u8_x3_ex(src, align) neon_ld1m3_8((__int8*)src) #define vld1_s64_x3_ex(src, align) neon_ld1m3_64((__int64*)src) #define vld1_u64_x3_ex(src, align) neon_ld1m3_64((__int64*)src) #define vld1_f64_x3_ex(src, align) neon_ld1m3_64((__int64*)src) #define vld1_f32_x4_ex(src, align) neon_ld1m4_32((__int32*)src) #define vld1_p16_x4_ex(src, align) neon_ld1m4_16((__int16*)src) #define vld1_p8_x4_ex(src, align) neon_ld1m4_8((__int8*)src) #define vld1_s16_x4_ex(src, align) neon_ld1m4_16((__int16*)src) #define vld1_s32_x4_ex(src, align) neon_ld1m4_32((__int32*)src) #define vld1_s8_x4_ex(src, align) neon_ld1m4_8((__int8*)src) #define vld1_u16_x4_ex(src, align) neon_ld1m4_16((__int16*)src) #define vld1_u32_x4_ex(src, align) neon_ld1m4_32((__int32*)src) #define vld1_u8_x4_ex(src, align) neon_ld1m4_8((__int8*)src) #define vld1_s64_x4_ex(src, align) neon_ld1m4_64((__int64*)src) #define vld1_u64_x4_ex(src, align) neon_ld1m4_64((__int64*)src) #define vld1_f64_x4_ex(src, align) neon_ld1m4_64((__int64*)src) #define vld1q_dup_f32_ex(src, align) neon_ld1r_q32((__int32*)src) #define vld1q_dup_p16_ex(src, align) neon_ld1r_q16((__int16*)src) #define vld1q_dup_p8_ex(src, align) neon_ld1r_q8((__int8*)src) #define vld1q_dup_s16_ex(src, align) neon_ld1r_q16((__int16*)src) #define vld1q_dup_s32_ex(src, align) neon_ld1r_q32((__int32*)src) #define vld1q_dup_s8_ex(src, align) 
neon_ld1r_q8((__int8*)src) #define vld1q_dup_u16_ex(src, align) neon_ld1r_q16((__int16*)src) #define vld1q_dup_u32_ex(src, align) neon_ld1r_q32((__int32*)src) #define vld1q_dup_u8_ex(src, align) neon_ld1r_q8((__int8*)src) #define vld1q_dup_s64_ex(src, align) neon_ld1r_q64((__int64*)src) #define vld1q_dup_u64_ex(src, align) neon_ld1r_q64((__int64*)src) #define vld1q_f32_ex(src, align) neon_ld1m_q32((__int32*)src) #define vld1q_p16_ex(src, align) neon_ld1m_q16((__int16*)src) #define vld1q_p8_ex(src, align) neon_ld1m_q8((__int8*)src) #define vld1q_s16_ex(src, align) neon_ld1m_q16((__int16*)src) #define vld1q_s32_ex(src, align) neon_ld1m_q32((__int32*)src) #define vld1q_s8_ex(src, align) neon_ld1m_q8((__int8*)src) #define vld1q_u16_ex(src, align) neon_ld1m_q16((__int16*)src) #define vld1q_u32_ex(src, align) neon_ld1m_q32((__int32*)src) #define vld1q_u8_ex(src, align) neon_ld1m_q8((__int8*)src) #define vld1q_s64_ex(src, align) neon_ld1m_q64((__int64*)src) #define vld1q_u64_ex(src, align) neon_ld1m_q64((__int64*)src) #define vld1q_f32_x2_ex(src, align) neon_ld1m2_q32((__int32*)src) #define vld1q_p16_x2_ex(src, align) neon_ld1m2_q16((__int16*)src) #define vld1q_p8_x2_ex(src, align) neon_ld1m2_q8((__int8*)src) #define vld1q_s16_x2_ex(src, align) neon_ld1m2_q16((__int16*)src) #define vld1q_s32_x2_ex(src, align) neon_ld1m2_q32((__int32*)src) #define vld1q_s8_x2_ex(src, align) neon_ld1m2_q8((__int8*)src) #define vld1q_u16_x2_ex(src, align) neon_ld1m2_q16((__int16*)src) #define vld1q_u32_x2_ex(src, align) neon_ld1m2_q32((__int32*)src) #define vld1q_u8_x2_ex(src, align) neon_ld1m2_q8((__int8*)src) #define vld1q_s64_x2_ex(src, align) neon_ld1m2_q64((__int64*)src) #define vld1q_u64_x2_ex(src, align) neon_ld1m2_q64((__int64*)src) #define vld1q_f32_x3_ex(src, align) neon_ld1m3_q32((__int32*)src) #define vld1q_p16_x3_ex(src, align) neon_ld1m3_q16((__int16*)src) #define vld1q_p8_x3_ex(src, align) neon_ld1m3_q8((__int8*)src) #define vld1q_s16_x3_ex(src, align) 
neon_ld1m3_q16((__int16*)src) #define vld1q_s32_x3_ex(src, align) neon_ld1m3_q32((__int32*)src) #define vld1q_s8_x3_ex(src, align) neon_ld1m3_q8((__int8*)src) #define vld1q_u16_x3_ex(src, align) neon_ld1m3_q16((__int16*)src) #define vld1q_u32_x3_ex(src, align) neon_ld1m3_q32((__int32*)src) #define vld1q_u8_x3_ex(src, align) neon_ld1m3_q8((__int8*)src) #define vld1q_s64_x3_ex(src, align) neon_ld1m3_q64((__int64*)src) #define vld1q_u64_x3_ex(src, align) neon_ld1m3_q64((__int64*)src) #define vld1q_f32_x4_ex(src, align) neon_ld1m4_q32((__int32*)src) #define vld1q_p16_x4_ex(src, align) neon_ld1m4_q16((__int16*)src) #define vld1q_p8_x4_ex(src, align) neon_ld1m4_q8((__int8*)src) #define vld1q_s16_x4_ex(src, align) neon_ld1m4_q16((__int16*)src) #define vld1q_s32_x4_ex(src, align) neon_ld1m4_q32((__int32*)src) #define vld1q_s8_x4_ex(src, align) neon_ld1m4_q8((__int8*)src) #define vld1q_u16_x4_ex(src, align) neon_ld1m4_q16((__int16*)src) #define vld1q_u32_x4_ex(src, align) neon_ld1m4_q32((__int32*)src) #define vld1q_u8_x4_ex(src, align) neon_ld1m4_q8((__int8*)src) #define vld1q_s64_x4_ex(src, align) neon_ld1m4_q64((__int64*)src) #define vld1q_u64_x4_ex(src, align) neon_ld1m4_q64((__int64*)src) #define vld1_lane_f32_ex(src1, src2, src3) neon_ld1s_32((__int32*)src1, src2, src3) #define vld1_lane_p16_ex(src1, src2, src3) neon_ld1s_16((__int16*)src1, src2, src3) #define vld1_lane_p8_ex(src1, src2, src3) neon_ld1s_8((__int8*)src1, src2, src3) #define vld1_lane_s16_ex(src1, src2, src3) neon_ld1s_16((__int16*)src1, src2, src3) #define vld1_lane_s32_ex(src1, src2, src3) neon_ld1s_32((__int32*)src1, src2, src3) #define vld1_lane_s64_ex(src1, src2, src3) neon_ld1s_64((__int64*)src1, src2, src3) #define vld1_lane_s8_ex(src1, src2, src3) neon_ld1s_8((__int8*)src1, src2, src3) #define vld1_lane_u16_ex(src1, src2, src3) neon_ld1s_16((__int16*)src1, src2, src3) #define vld1_lane_u32_ex(src1, src2, src3) neon_ld1s_32((__int32*)src1, src2, src3) #define vld1_lane_u8_ex(src1, src2, src3) 
neon_ld1s_8((__int8*)src1, src2, src3) #define vld1q_lane_f32_ex(src1, src2, src3) neon_ld1s_q32((__int32*)src1, src2, src3) #define vld1q_lane_p8_ex(src1, src2, src3) neon_ld1s_q8((__int8*)src1, src2, src3) #define vld1q_lane_p16_ex(src1, src2, src3) neon_ld1s_q16((__int16*)src1, src2, src3) #define vld1q_lane_s16_ex(src1, src2, src3) neon_ld1s_q16((__int16*)src1, src2, src3) #define vld1q_lane_s32_ex(src1, src2, src3) neon_ld1s_q32((__int32*)src1, src2, src3) #define vld1q_lane_s64_ex(src1, src2, src3) neon_ld1s_q64((__int64*)src1, src2, src3) #define vld1q_lane_u16_ex(src1, src2, src3) neon_ld1s_q16((__int16*)src1, src2, src3) #define vld1q_lane_u32_ex(src1, src2, src3) neon_ld1s_q32((__int32*)src1, src2, src3) // ST1/ST2/ST3/ST4 void neon_st4m_8(__int8 * ptr, __n64x4 src); void neon_st4m_q8(__int8 * ptr, __n128x4 src); void neon_st4m_16(__int16 * ptr, __n64x4 src); void neon_st4m_q16(__int16 * ptr, __n128x4 src); void neon_st4m_32(__int32 * ptr, __n64x4 src); void neon_st4m_q32(__int32 * ptr, __n128x4 src); void neon_st4m_q64(__int64 * ptr, __n128x4 src); void neon_st4s_8(__int8 * ptr, __n64x4 src, const int lane); void neon_st4s_q8(__int8 * ptr, __n128x4 src, const int lane); void neon_st4s_16(__int16 * ptr, __n64x4 src, const int lane); void neon_st4s_q16(__int16 * ptr, __n128x4 src, const int lane); void neon_st4s_32(__int32 * ptr, __n64x4 src, const int lane); void neon_st4s_q32(__int32 * ptr, __n128x4 src, const int lane); void neon_st4s_64(__int64 * ptr, __n64x4 src, const int lane); void neon_st4s_q64(__int64 * ptr, __n128x4 src, const int lane); void neon_st3m_8(__int8 * ptr, __n64x3 src); void neon_st3m_q8(__int8 * ptr, __n128x3 src); void neon_st3m_16(__int16 * ptr, __n64x3 src); void neon_st3m_q16(__int16 * ptr, __n128x3 src); void neon_st3m_32(__int32 * ptr, __n64x3 src); void neon_st3m_q32(__int32 * ptr, __n128x3 src); void neon_st3m_q64(__int64 * ptr, __n128x3 src); void neon_st3s_8(__int8 * ptr, __n64x3 src, const int lane); void 
neon_st3s_q8(__int8 * ptr, __n128x3 src, const int lane); void neon_st3s_16(__int16 * ptr, __n64x3 src, const int lane); void neon_st3s_q16(__int16 * ptr, __n128x3 src, const int lane); void neon_st3s_32(__int32 * ptr, __n64x3 src, const int lane); void neon_st3s_q32(__int32 * ptr, __n128x3 src, const int lane); void neon_st3s_64(__int64 * ptr, __n64x3 src, const int lane); void neon_st3s_q64(__int64 * ptr, __n128x3 src, const int lane); void neon_st2m_8(__int8 * ptr, __n64x2 src); void neon_st2m_q8(__int8 * ptr, __n128x2 src); void neon_st2m_16(__int16 * ptr, __n64x2 src); void neon_st2m_q16(__int16 * ptr, __n128x2 src); void neon_st2m_32(__int32 * ptr, __n64x2 src); void neon_st2m_q32(__int32 * ptr, __n128x2 src); void neon_st2m_q64(__int64 * ptr, __n128x2 src); void neon_st2s_8(__int8 * ptr, __n64x2 src, const int lane); void neon_st2s_q8(__int8 * ptr, __n128x2 src, const int lane); void neon_st2s_16(__int16 * ptr, __n64x2 src, const int lane); void neon_st2s_q16(__int16 * ptr, __n128x2 src, const int lane); void neon_st2s_32(__int32 * ptr, __n64x2 src, const int lane); void neon_st2s_q32(__int32 * ptr, __n128x2 src, const int lane); void neon_st2s_64(__int64 * ptr, __n64x2 src, const int lane); void neon_st2s_q64(__int64 * ptr, __n128x2 src, const int lane); void neon_st1m_8(__int8 * ptr, __n64 src); void neon_st1m_q8(__int8 * ptr, __n128 src); void neon_st1m_16(__int16 * ptr, __n64 src); void neon_st1m_q16(__int16 * ptr, __n128 src); void neon_st1m_32(__int32 * ptr, __n64 src); void neon_st1m_q32(__int32 * ptr, __n128 src); void neon_st1m_64(__int64 * ptr, __n64 src); void neon_st1m_q64(__int64 * ptr, __n128 src); void neon_st1m2_8(__int8 * ptr, __n64x2 src); void neon_st1m2_q8(__int8 * ptr, __n128x2 src); void neon_st1m2_16(__int16 * ptr, __n64x2 src); void neon_st1m2_q16(__int16 * ptr, __n128x2 src); void neon_st1m2_32(__int32 * ptr, __n64x2 src); void neon_st1m2_q32(__int32 * ptr, __n128x2 src); void neon_st1m2_64(__int64 * ptr, __n64x2 src); void 
neon_st1m2_q64(__int64 * ptr, __n128x2 src); void neon_st1m3_8(__int8 * ptr, __n64x3 src); void neon_st1m3_q8(__int8 * ptr, __n128x3 src); void neon_st1m3_16(__int16 * ptr, __n64x3 src); void neon_st1m3_q16(__int16 * ptr, __n128x3 src); void neon_st1m3_32(__int32 * ptr, __n64x3 src); void neon_st1m3_q32(__int32 * ptr, __n128x3 src); void neon_st1m3_64(__int64 * ptr, __n64x3 src); void neon_st1m3_q64(__int64 * ptr, __n128x3 src); void neon_st1m4_8(__int8 * ptr, __n64x4 src); void neon_st1m4_q8(__int8 * ptr, __n128x4 src); void neon_st1m4_16(__int16 * ptr, __n64x4 src); void neon_st1m4_q16(__int16 * ptr, __n128x4 src); void neon_st1m4_32(__int32 * ptr, __n64x4 src); void neon_st1m4_q32(__int32 * ptr, __n128x4 src); void neon_st1m4_64(__int64 * ptr, __n64x4 src); void neon_st1m4_q64(__int64 * ptr, __n128x4 src); void neon_st1s_8(__int8 * ptr, __n64 src, const int lane); void neon_st1s_q8(__int8 * ptr, __n128 src, const int lane); void neon_st1s_16(__int16 * ptr, __n64 src, const int lane); void neon_st1s_q16(__int16 * ptr, __n128 src, const int lane); void neon_st1s_32(__int32 * ptr, __n64 src, const int lane); void neon_st1s_q32(__int32 * ptr, __n128 src, const int lane); void neon_st1s_64(__int64 * ptr, __n64 src, const int lane); void neon_st1s_q64(__int64 * ptr, __n128 src, const int lane); #define vst4_f32(src1, src2) neon_st4m_32((__int32*)src1, src2) #define vst4_p16(src1, src2) neon_st4m_16((__int16*)src1, src2) #define vst4_p8(src1, src2) neon_st4m_8((__int8*)src1, src2) #define vst4_s16(src1, src2) neon_st4m_16((__int16*)src1, src2) #define vst4_s32(src1, src2) neon_st4m_32((__int32*)src1, src2) #define vst4_s8(src1, src2) neon_st4m_8((__int8*)src1, src2) #define vst4_u16(src1, src2) neon_st4m_16((__int16*)src1, src2) #define vst4_u32(src1, src2) neon_st4m_32((__int32*)src1, src2) #define vst4_u8(src1, src2) neon_st4m_8((__int8*)src1, src2) #define vst4_s64(src1, src2) neon_st1m4_64((__int64*)src1, src2) #define vst4_u64(src1, src2) 
neon_st1m4_64((__int64*)src1, src2) #define vst4q_f32(src1, src2) neon_st4m_q32((__int32*)src1, src2) #define vst4q_p16(src1, src2) neon_st4m_q16((__int16*)src1, src2) #define vst4q_p8(src1, src2) neon_st4m_q8((__int8*)src1, src2) #define vst4q_s16(src1, src2) neon_st4m_q16((__int16*)src1, src2) #define vst4q_s32(src1, src2) neon_st4m_q32((__int32*)src1, src2) #define vst4q_s8(src1, src2) neon_st4m_q8((__int8*)src1, src2) #define vst4q_u16(src1, src2) neon_st4m_q16((__int16*)src1, src2) #define vst4q_u32(src1, src2) neon_st4m_q32((__int32*)src1, src2) #define vst4q_u8(src1, src2) neon_st4m_q8((__int8*)src1, src2) #define vst4q_s64(src1, src2) neon_st4m_q64((__int64*)src1, src2) #define vst4q_u64(src1, src2) neon_st4m_q64((__int64*)src1, src2) #define vst4_lane_f32(src1, src2, src3) neon_st4s_32((__int32*)src1, src2, src3) #define vst4_lane_p16(src1, src2, src3) neon_st4s_16((__int16*)src1, src2, src3) #define vst4_lane_p8(src1, src2, src3) neon_st4s_8((__int8*)src1, src2, src3) #define vst4_lane_s16(src1, src2, src3) neon_st4s_16((__int16*)src1, src2, src3) #define vst4_lane_s32(src1, src2, src3) neon_st4s_32((__int32*)src1, src2, src3) #define vst4_lane_s64(src1, src2, src3) neon_st4s_64((__int64*)src1, src2, src3) #define vst4_lane_s8(src1, src2, src3) neon_st4s_8((__int8*)src1, src2, src3) #define vst4_lane_u16(src1, src2, src3) neon_st4s_16((__int16*)src1, src2, src3) #define vst4_lane_u32(src1, src2, src3) neon_st4s_32((__int32*)src1, src2, src3) #define vst4_lane_u8(src1, src2, src3) neon_st4s_8((__int8*)src1, src2, src3) #define vst4q_lane_f32(src1, src2, src3) neon_st4s_q32((__int32*)src1, src2, src3) #define vst4q_lane_p8(src1, src2, src3) neon_st4s_q8((__int8*)src1, src2, src3) #define vst4q_lane_p16(src1, src2, src3) neon_st4s_q16((__int16*)src1, src2, src3) #define vst4q_lane_s16(src1, src2, src3) neon_st4s_q16((__int16*)src1, src2, src3) #define vst4q_lane_s32(src1, src2, src3) neon_st4s_q32((__int32*)src1, src2, src3) #define vst4q_lane_s64(src1, 
src2, src3) neon_st4s_q64((__int64*)src1, src2, src3) #define vst4q_lane_u16(src1, src2, src3) neon_st4s_q16((__int16*)src1, src2, src3) #define vst4q_lane_u32(src1, src2, src3) neon_st4s_q32((__int32*)src1, src2, src3) #define vst3_f32(src1, src2) neon_st3m_32((__int32*)src1, src2) #define vst3_p16(src1, src2) neon_st3m_16((__int16*)src1, src2) #define vst3_p8(src1, src2) neon_st3m_8((__int8*)src1, src2) #define vst3_s16(src1, src2) neon_st3m_16((__int16*)src1, src2) #define vst3_s32(src1, src2) neon_st3m_32((__int32*)src1, src2) #define vst3_s8(src1, src2) neon_st3m_8((__int8*)src1, src2) #define vst3_u16(src1, src2) neon_st3m_16((__int16*)src1, src2) #define vst3_u32(src1, src2) neon_st3m_32((__int32*)src1, src2) #define vst3_u8(src1, src2) neon_st3m_8((__int8*)src1, src2) #define vst3_s64(src1, src2) neon_st1m3_64((__int64*)src1, src2) #define vst3_u64(src1, src2) neon_st1m3_64((__int64*)src1, src2) #define vst3q_f32(src1, src2) neon_st3m_q32((__int32*)src1, src2) #define vst3q_p16(src1, src2) neon_st3m_q16((__int16*)src1, src2) #define vst3q_p8(src1, src2) neon_st3m_q8((__int8*)src1, src2) #define vst3q_s16(src1, src2) neon_st3m_q16((__int16*)src1, src2) #define vst3q_s32(src1, src2) neon_st3m_q32((__int32*)src1, src2) #define vst3q_s8(src1, src2) neon_st3m_q8((__int8*)src1, src2) #define vst3q_u16(src1, src2) neon_st3m_q16((__int16*)src1, src2) #define vst3q_u32(src1, src2) neon_st3m_q32((__int32*)src1, src2) #define vst3q_u8(src1, src2) neon_st3m_q8((__int8*)src1, src2) #define vst3q_s64(src1, src2) neon_st3m_q64((__int64*)src1, src2) #define vst3q_u64(src1, src2) neon_st3m_q64((__int64*)src1, src2) #define vst3_lane_f32(src1, src2, src3) neon_st3s_32((__int32*)src1, src2, src3) #define vst3_lane_p16(src1, src2, src3) neon_st3s_16((__int16*)src1, src2, src3) #define vst3_lane_p8(src1, src2, src3) neon_st3s_8((__int8*)src1, src2, src3) #define vst3_lane_s16(src1, src2, src3) neon_st3s_16((__int16*)src1, src2, src3) #define vst3_lane_s32(src1, src2, src3) 
neon_st3s_32((__int32*)src1, src2, src3) #define vst3_lane_s64(src1, src2, src3) neon_st3s_64((__int64*)src1, src2, src3) #define vst3_lane_s8(src1, src2, src3) neon_st3s_8((__int8*)src1, src2, src3) #define vst3_lane_u16(src1, src2, src3) neon_st3s_16((__int16*)src1, src2, src3) #define vst3_lane_u32(src1, src2, src3) neon_st3s_32((__int32*)src1, src2, src3) #define vst3_lane_u8(src1, src2, src3) neon_st3s_8((__int8*)src1, src2, src3) #define vst3q_lane_f32(src1, src2, src3) neon_st3s_q32((__int32*)src1, src2, src3) #define vst3q_lane_p8(src1, src2, src3) neon_st3s_q8((__int8*)src1, src2, src3) #define vst3q_lane_p16(src1, src2, src3) neon_st3s_q16((__int16*)src1, src2, src3) #define vst3q_lane_s16(src1, src2, src3) neon_st3s_q16((__int16*)src1, src2, src3) #define vst3q_lane_s32(src1, src2, src3) neon_st3s_q32((__int32*)src1, src2, src3) #define vst3q_lane_s64(src1, src2, src3) neon_st3s_q64((__int64*)src1, src2, src3) #define vst3q_lane_u16(src1, src2, src3) neon_st3s_q16((__int16*)src1, src2, src3) #define vst3q_lane_u32(src1, src2, src3) neon_st3s_q32((__int32*)src1, src2, src3) #define vst2_f32(src1, src2) neon_st2m_32((__int32*)src1, src2) #define vst2_p16(src1, src2) neon_st2m_16((__int16*)src1, src2) #define vst2_p8(src1, src2) neon_st2m_8((__int8*)src1, src2) #define vst2_s16(src1, src2) neon_st2m_16((__int16*)src1, src2) #define vst2_s32(src1, src2) neon_st2m_32((__int32*)src1, src2) #define vst2_s8(src1, src2) neon_st2m_8((__int8*)src1, src2) #define vst2_u16(src1, src2) neon_st2m_16((__int16*)src1, src2) #define vst2_u32(src1, src2) neon_st2m_32((__int32*)src1, src2) #define vst2_u8(src1, src2) neon_st2m_8((__int8*)src1, src2) #define vst2_s64(src1, src2) neon_st1m2_64((__int64*)src1, src2) #define vst2_u64(src1, src2) neon_st1m2_64((__int64*)src1, src2) #define vst2q_f32(src1, src2) neon_st2m_q32((__int32*)src1, src2) #define vst2q_p16(src1, src2) neon_st2m_q16((__int16*)src1, src2) #define vst2q_p8(src1, src2) neon_st2m_q8((__int8*)src1, src2) 
// Store macros (vst2q / vst2 lane / vst1 / vst4_ex / vst3_ex / vst2_ex).
//
// Every macro below casts its pointer argument (src1) to the signed integer
// pointer type whose element width matches the macro's type suffix, then
// forwards to the neon_st* intrinsic of that width. Float, poly, signed and
// unsigned variants of the same element size share one intrinsic. All other
// arguments are passed through unchanged.

// vst2q_<type>(ptr, val) -> neon_st2m_q<bits>
#define vst2q_s16(src1, src2) neon_st2m_q16((__int16*)src1, src2)
#define vst2q_s32(src1, src2) neon_st2m_q32((__int32*)src1, src2)
#define vst2q_s8(src1, src2) neon_st2m_q8((__int8*)src1, src2)
#define vst2q_u16(src1, src2) neon_st2m_q16((__int16*)src1, src2)
#define vst2q_u32(src1, src2) neon_st2m_q32((__int32*)src1, src2)
#define vst2q_u8(src1, src2) neon_st2m_q8((__int8*)src1, src2)
#define vst2q_s64(src1, src2) neon_st2m_q64((__int64*)src1, src2)
#define vst2q_u64(src1, src2) neon_st2m_q64((__int64*)src1, src2)

// vst2_lane_<type>(ptr, val, lane) -> neon_st2s_<bits>
#define vst2_lane_f32(src1, src2, src3) neon_st2s_32((__int32*)src1, src2, src3)
#define vst2_lane_p16(src1, src2, src3) neon_st2s_16((__int16*)src1, src2, src3)
#define vst2_lane_p8(src1, src2, src3) neon_st2s_8((__int8*)src1, src2, src3)
#define vst2_lane_s16(src1, src2, src3) neon_st2s_16((__int16*)src1, src2, src3)
#define vst2_lane_s32(src1, src2, src3) neon_st2s_32((__int32*)src1, src2, src3)
#define vst2_lane_s64(src1, src2, src3) neon_st2s_64((__int64*)src1, src2, src3)
#define vst2_lane_s8(src1, src2, src3) neon_st2s_8((__int8*)src1, src2, src3)
#define vst2_lane_u16(src1, src2, src3) neon_st2s_16((__int16*)src1, src2, src3)
#define vst2_lane_u32(src1, src2, src3) neon_st2s_32((__int32*)src1, src2, src3)
#define vst2_lane_u8(src1, src2, src3) neon_st2s_8((__int8*)src1, src2, src3)

// vst2q_lane_<type>(ptr, val, lane) -> neon_st2s_q<bits>
#define vst2q_lane_f32(src1, src2, src3) neon_st2s_q32((__int32*)src1, src2, src3)
#define vst2q_lane_p8(src1, src2, src3) neon_st2s_q8((__int8*)src1, src2, src3)
#define vst2q_lane_p16(src1, src2, src3) neon_st2s_q16((__int16*)src1, src2, src3)
#define vst2q_lane_s16(src1, src2, src3) neon_st2s_q16((__int16*)src1, src2, src3)
#define vst2q_lane_s32(src1, src2, src3) neon_st2s_q32((__int32*)src1, src2, src3)
#define vst2q_lane_s64(src1, src2, src3) neon_st2s_q64((__int64*)src1, src2, src3)
#define vst2q_lane_u16(src1, src2, src3) neon_st2s_q16((__int16*)src1, src2, src3)
#define vst2q_lane_u32(src1, src2, src3) neon_st2s_q32((__int32*)src1, src2, src3)

// vst1_<type>(ptr, val) -> neon_st1m_<bits>
#define vst1_f32(src1, src2) neon_st1m_32((__int32*)src1, src2)
#define vst1_p16(src1, src2) neon_st1m_16((__int16*)src1, src2)
#define vst1_p8(src1, src2) neon_st1m_8((__int8*)src1, src2)
#define vst1_s16(src1, src2) neon_st1m_16((__int16*)src1, src2)
#define vst1_s32(src1, src2) neon_st1m_32((__int32*)src1, src2)
#define vst1_s8(src1, src2) neon_st1m_8((__int8*)src1, src2)
#define vst1_u16(src1, src2) neon_st1m_16((__int16*)src1, src2)
#define vst1_u32(src1, src2) neon_st1m_32((__int32*)src1, src2)
#define vst1_u8(src1, src2) neon_st1m_8((__int8*)src1, src2)
#define vst1_s64(src1, src2) neon_st1m_64((__int64*)src1, src2)
#define vst1_u64(src1, src2) neon_st1m_64((__int64*)src1, src2)

// vst1q_<type>(ptr, val) -> neon_st1m_q<bits>
#define vst1q_f32(src1, src2) neon_st1m_q32((__int32*)src1, src2)
#define vst1q_p16(src1, src2) neon_st1m_q16((__int16*)src1, src2)
#define vst1q_p8(src1, src2) neon_st1m_q8((__int8*)src1, src2)
#define vst1q_s16(src1, src2) neon_st1m_q16((__int16*)src1, src2)
#define vst1q_s32(src1, src2) neon_st1m_q32((__int32*)src1, src2)
#define vst1q_s8(src1, src2) neon_st1m_q8((__int8*)src1, src2)
#define vst1q_u16(src1, src2) neon_st1m_q16((__int16*)src1, src2)
#define vst1q_u32(src1, src2) neon_st1m_q32((__int32*)src1, src2)
#define vst1q_u8(src1, src2) neon_st1m_q8((__int8*)src1, src2)
#define vst1q_s64(src1, src2) neon_st1m_q64((__int64*)src1, src2)
#define vst1q_u64(src1, src2) neon_st1m_q64((__int64*)src1, src2)

// vst1_<type>_x2(ptr, val) -> neon_st1m2_<bits>
#define vst1_f32_x2(src1, src2) neon_st1m2_32((__int32*)src1, src2)
#define vst1_p16_x2(src1, src2) neon_st1m2_16((__int16*)src1, src2)
#define vst1_p8_x2(src1, src2) neon_st1m2_8((__int8*)src1, src2)
#define vst1_s16_x2(src1, src2) neon_st1m2_16((__int16*)src1, src2)
#define vst1_s32_x2(src1, src2) neon_st1m2_32((__int32*)src1, src2)
#define vst1_s8_x2(src1, src2) neon_st1m2_8((__int8*)src1, src2)
#define vst1_u16_x2(src1, src2) neon_st1m2_16((__int16*)src1, src2)
#define vst1_u32_x2(src1, src2) neon_st1m2_32((__int32*)src1, src2)
#define vst1_u8_x2(src1, src2) neon_st1m2_8((__int8*)src1, src2)
#define vst1_s64_x2(src1, src2) neon_st1m2_64((__int64*)src1, src2)
#define vst1_u64_x2(src1, src2) neon_st1m2_64((__int64*)src1, src2)

// vst1q_<type>_x2(ptr, val) -> neon_st1m2_q<bits>
#define vst1q_f32_x2(src1, src2) neon_st1m2_q32((__int32*)src1, src2)
#define vst1q_p16_x2(src1, src2) neon_st1m2_q16((__int16*)src1, src2)
#define vst1q_p8_x2(src1, src2) neon_st1m2_q8((__int8*)src1, src2)
#define vst1q_s16_x2(src1, src2) neon_st1m2_q16((__int16*)src1, src2)
#define vst1q_s32_x2(src1, src2) neon_st1m2_q32((__int32*)src1, src2)
#define vst1q_s8_x2(src1, src2) neon_st1m2_q8((__int8*)src1, src2)
#define vst1q_u16_x2(src1, src2) neon_st1m2_q16((__int16*)src1, src2)
#define vst1q_u32_x2(src1, src2) neon_st1m2_q32((__int32*)src1, src2)
#define vst1q_u8_x2(src1, src2) neon_st1m2_q8((__int8*)src1, src2)
#define vst1q_s64_x2(src1, src2) neon_st1m2_q64((__int64*)src1, src2)
#define vst1q_u64_x2(src1, src2) neon_st1m2_q64((__int64*)src1, src2)

// vst1_<type>_x3(ptr, val) -> neon_st1m3_<bits>
#define vst1_f32_x3(src1, src2) neon_st1m3_32((__int32*)src1, src2)
#define vst1_p16_x3(src1, src2) neon_st1m3_16((__int16*)src1, src2)
#define vst1_p8_x3(src1, src2) neon_st1m3_8((__int8*)src1, src2)
#define vst1_s16_x3(src1, src2) neon_st1m3_16((__int16*)src1, src2)
#define vst1_s32_x3(src1, src2) neon_st1m3_32((__int32*)src1, src2)
#define vst1_s8_x3(src1, src2) neon_st1m3_8((__int8*)src1, src2)
#define vst1_u16_x3(src1, src2) neon_st1m3_16((__int16*)src1, src2)
#define vst1_u32_x3(src1, src2) neon_st1m3_32((__int32*)src1, src2)
#define vst1_u8_x3(src1, src2) neon_st1m3_8((__int8*)src1, src2)
#define vst1_s64_x3(src1, src2) neon_st1m3_64((__int64*)src1, src2)
#define vst1_u64_x3(src1, src2) neon_st1m3_64((__int64*)src1, src2)

// vst1q_<type>_x3(ptr, val) -> neon_st1m3_q<bits>
#define vst1q_f32_x3(src1, src2) neon_st1m3_q32((__int32*)src1, src2)
#define vst1q_p16_x3(src1, src2) neon_st1m3_q16((__int16*)src1, src2)
#define vst1q_p8_x3(src1, src2) neon_st1m3_q8((__int8*)src1, src2)
#define vst1q_s16_x3(src1, src2) neon_st1m3_q16((__int16*)src1, src2)
#define vst1q_s32_x3(src1, src2) neon_st1m3_q32((__int32*)src1, src2)
#define vst1q_s8_x3(src1, src2) neon_st1m3_q8((__int8*)src1, src2)
#define vst1q_u16_x3(src1, src2) neon_st1m3_q16((__int16*)src1, src2)
#define vst1q_u32_x3(src1, src2) neon_st1m3_q32((__int32*)src1, src2)
#define vst1q_u8_x3(src1, src2) neon_st1m3_q8((__int8*)src1, src2)
#define vst1q_s64_x3(src1, src2) neon_st1m3_q64((__int64*)src1, src2)
#define vst1q_u64_x3(src1, src2) neon_st1m3_q64((__int64*)src1, src2)

// vst1_<type>_x4(ptr, val) -> neon_st1m4_<bits>
#define vst1_f32_x4(src1, src2) neon_st1m4_32((__int32*)src1, src2)
#define vst1_p16_x4(src1, src2) neon_st1m4_16((__int16*)src1, src2)
#define vst1_p8_x4(src1, src2) neon_st1m4_8((__int8*)src1, src2)
#define vst1_s16_x4(src1, src2) neon_st1m4_16((__int16*)src1, src2)
#define vst1_s32_x4(src1, src2) neon_st1m4_32((__int32*)src1, src2)
#define vst1_s8_x4(src1, src2) neon_st1m4_8((__int8*)src1, src2)
#define vst1_u16_x4(src1, src2) neon_st1m4_16((__int16*)src1, src2)
#define vst1_u32_x4(src1, src2) neon_st1m4_32((__int32*)src1, src2)
#define vst1_u8_x4(src1, src2) neon_st1m4_8((__int8*)src1, src2)
#define vst1_s64_x4(src1, src2) neon_st1m4_64((__int64*)src1, src2)
#define vst1_u64_x4(src1, src2) neon_st1m4_64((__int64*)src1, src2)

// vst1q_<type>_x4(ptr, val) -> neon_st1m4_q<bits>
#define vst1q_f32_x4(src1, src2) neon_st1m4_q32((__int32*)src1, src2)
#define vst1q_p16_x4(src1, src2) neon_st1m4_q16((__int16*)src1, src2)
#define vst1q_p8_x4(src1, src2) neon_st1m4_q8((__int8*)src1, src2)
#define vst1q_s16_x4(src1, src2) neon_st1m4_q16((__int16*)src1, src2)
#define vst1q_s32_x4(src1, src2) neon_st1m4_q32((__int32*)src1, src2)
#define vst1q_s8_x4(src1, src2) neon_st1m4_q8((__int8*)src1, src2)
#define vst1q_u16_x4(src1, src2) neon_st1m4_q16((__int16*)src1, src2)
#define vst1q_u32_x4(src1, src2) neon_st1m4_q32((__int32*)src1, src2)
#define vst1q_u8_x4(src1, src2) neon_st1m4_q8((__int8*)src1, src2)
#define vst1q_s64_x4(src1, src2) neon_st1m4_q64((__int64*)src1, src2)
#define vst1q_u64_x4(src1, src2) neon_st1m4_q64((__int64*)src1, src2)

// vst1_lane_<type>(ptr, val, lane) -> neon_st1s_<bits>
#define vst1_lane_f32(src1, src2, src3) neon_st1s_32((__int32*)src1, src2, src3)
#define vst1_lane_p16(src1, src2, src3) neon_st1s_16((__int16*)src1, src2, src3)
#define vst1_lane_p8(src1, src2, src3) neon_st1s_8((__int8*)src1, src2, src3)
#define vst1_lane_s16(src1, src2, src3) neon_st1s_16((__int16*)src1, src2, src3)
#define vst1_lane_s32(src1, src2, src3) neon_st1s_32((__int32*)src1, src2, src3)
#define vst1_lane_s64(src1, src2, src3) neon_st1s_64((__int64*)src1, src2, src3)
#define vst1_lane_s8(src1, src2, src3) neon_st1s_8((__int8*)src1, src2, src3)
#define vst1_lane_u16(src1, src2, src3) neon_st1s_16((__int16*)src1, src2, src3)
#define vst1_lane_u32(src1, src2, src3) neon_st1s_32((__int32*)src1, src2, src3)
#define vst1_lane_u8(src1, src2, src3) neon_st1s_8((__int8*)src1, src2, src3)

// vst1q_lane_<type>(ptr, val, lane) -> neon_st1s_q<bits>
#define vst1q_lane_f32(src1, src2, src3) neon_st1s_q32((__int32*)src1, src2, src3)
#define vst1q_lane_p8(src1, src2, src3) neon_st1s_q8((__int8*)src1, src2, src3)
#define vst1q_lane_p16(src1, src2, src3) neon_st1s_q16((__int16*)src1, src2, src3)
// The cast must be __int8* to match the pointer parameter of neon_st1s_q8.
#define vst1q_lane_s8(src1, src2, src3) neon_st1s_q8((__int8*)src1, src2, src3)
#define vst1q_lane_s16(src1, src2, src3) neon_st1s_q16((__int16*)src1, src2, src3)
#define vst1q_lane_s32(src1, src2, src3) neon_st1s_q32((__int32*)src1, src2, src3)
#define vst1q_lane_s64(src1, src2, src3) neon_st1s_q64((__int64*)src1, src2, src3)
#define vst1q_lane_u8(src1, src2, src3) neon_st1s_q8((__int8*)src1, src2, src3)
#define vst1q_lane_u16(src1, src2, src3) neon_st1s_q16((__int16*)src1, src2, src3)
#define vst1q_lane_u32(src1, src2, src3) neon_st1s_q32((__int32*)src1, src2, src3)

// "_ex" store variants: same expansions as the plain macros of the same name;
// the trailing align argument is accepted and discarded.

// vst4_<type>_ex(ptr, val, align) -> neon_st4m_<bits>
// The 64-bit D-register forms forward to neon_st1m4_64 instead.
#define vst4_f32_ex(src1, src2, align) neon_st4m_32((__int32*)src1, src2)
#define vst4_p16_ex(src1, src2, align) neon_st4m_16((__int16*)src1, src2)
#define vst4_p8_ex(src1, src2, align) neon_st4m_8((__int8*)src1, src2)
#define vst4_s16_ex(src1, src2, align) neon_st4m_16((__int16*)src1, src2)
#define vst4_s32_ex(src1, src2, align) neon_st4m_32((__int32*)src1, src2)
#define vst4_s8_ex(src1, src2, align) neon_st4m_8((__int8*)src1, src2)
#define vst4_u16_ex(src1, src2, align) neon_st4m_16((__int16*)src1, src2)
#define vst4_u32_ex(src1, src2, align) neon_st4m_32((__int32*)src1, src2)
#define vst4_u8_ex(src1, src2, align) neon_st4m_8((__int8*)src1, src2)
#define vst4_s64_ex(src1, src2, align) neon_st1m4_64((__int64*)src1, src2)
#define vst4_u64_ex(src1, src2, align) neon_st1m4_64((__int64*)src1, src2)

// vst4q_<type>_ex(ptr, val, align) -> neon_st4m_q<bits>
#define vst4q_f32_ex(src1, src2, align) neon_st4m_q32((__int32*)src1, src2)
#define vst4q_p16_ex(src1, src2, align) neon_st4m_q16((__int16*)src1, src2)
#define vst4q_p8_ex(src1, src2, align) neon_st4m_q8((__int8*)src1, src2)
#define vst4q_s16_ex(src1, src2, align) neon_st4m_q16((__int16*)src1, src2)
#define vst4q_s32_ex(src1, src2, align) neon_st4m_q32((__int32*)src1, src2)
#define vst4q_s8_ex(src1, src2, align) neon_st4m_q8((__int8*)src1, src2)
#define vst4q_u16_ex(src1, src2, align) neon_st4m_q16((__int16*)src1, src2)
#define vst4q_u32_ex(src1, src2, align) neon_st4m_q32((__int32*)src1, src2)
#define vst4q_u8_ex(src1, src2, align) neon_st4m_q8((__int8*)src1, src2)
#define vst4q_s64_ex(src1, src2, align) neon_st4m_q64((__int64*)src1, src2)
#define vst4q_u64_ex(src1, src2, align) neon_st4m_q64((__int64*)src1, src2)

// vst4_lane_<type>_ex(ptr, val, lane, align) -> neon_st4s_<bits>
#define vst4_lane_f32_ex(src1, src2, src3, align) neon_st4s_32((__int32*)src1, src2, src3)
#define vst4_lane_p16_ex(src1, src2, src3, align) neon_st4s_16((__int16*)src1, src2, src3)
#define vst4_lane_p8_ex(src1, src2, src3, align) neon_st4s_8((__int8*)src1, src2, src3)
#define vst4_lane_s16_ex(src1, src2, src3, align) neon_st4s_16((__int16*)src1, src2, src3)
#define vst4_lane_s32_ex(src1, src2, src3, align) neon_st4s_32((__int32*)src1, src2, src3)
#define vst4_lane_s64_ex(src1, src2, src3, align) neon_st4s_64((__int64*)src1, src2, src3)
#define vst4_lane_s8_ex(src1, src2, src3, align) neon_st4s_8((__int8*)src1, src2, src3)
#define vst4_lane_u16_ex(src1, src2, src3, align) neon_st4s_16((__int16*)src1, src2, src3)
#define vst4_lane_u32_ex(src1, src2, src3, align) neon_st4s_32((__int32*)src1, src2, src3)
#define vst4_lane_u8_ex(src1, src2, src3, align) neon_st4s_8((__int8*)src1, src2, src3)

// vst4q_lane_<type>_ex(ptr, val, lane, align) -> neon_st4s_q<bits>
#define vst4q_lane_f32_ex(src1, src2, src3, align) neon_st4s_q32((__int32*)src1, src2, src3)
#define vst4q_lane_p8_ex(src1, src2, src3, align) neon_st4s_q8((__int8*)src1, src2, src3)
#define vst4q_lane_p16_ex(src1, src2, src3, align) neon_st4s_q16((__int16*)src1, src2, src3)
#define vst4q_lane_s16_ex(src1, src2, src3, align) neon_st4s_q16((__int16*)src1, src2, src3)
#define vst4q_lane_s32_ex(src1, src2, src3, align) neon_st4s_q32((__int32*)src1, src2, src3)
#define vst4q_lane_s64_ex(src1, src2, src3, align) neon_st4s_q64((__int64*)src1, src2, src3)
#define vst4q_lane_u16_ex(src1, src2, src3, align) neon_st4s_q16((__int16*)src1, src2, src3)
#define vst4q_lane_u32_ex(src1, src2, src3, align) neon_st4s_q32((__int32*)src1, src2, src3)

// vst3_<type>_ex(ptr, val, align) -> neon_st3m_<bits>
// The 64-bit D-register forms forward to neon_st1m3_64 instead.
#define vst3_f32_ex(src1, src2, align) neon_st3m_32((__int32*)src1, src2)
#define vst3_p16_ex(src1, src2, align) neon_st3m_16((__int16*)src1, src2)
#define vst3_p8_ex(src1, src2, align) neon_st3m_8((__int8*)src1, src2)
#define vst3_s16_ex(src1, src2, align) neon_st3m_16((__int16*)src1, src2)
#define vst3_s32_ex(src1, src2, align) neon_st3m_32((__int32*)src1, src2)
#define vst3_s8_ex(src1, src2, align) neon_st3m_8((__int8*)src1, src2)
#define vst3_u16_ex(src1, src2, align) neon_st3m_16((__int16*)src1, src2)
#define vst3_u32_ex(src1, src2, align) neon_st3m_32((__int32*)src1, src2)
#define vst3_u8_ex(src1, src2, align) neon_st3m_8((__int8*)src1, src2)
#define vst3_s64_ex(src1, src2, align) neon_st1m3_64((__int64*)src1, src2)
#define vst3_u64_ex(src1, src2, align) neon_st1m3_64((__int64*)src1, src2)

// vst3q_<type>_ex(ptr, val, align) -> neon_st3m_q<bits>
#define vst3q_f32_ex(src1, src2, align) neon_st3m_q32((__int32*)src1, src2)
#define vst3q_p16_ex(src1, src2, align) neon_st3m_q16((__int16*)src1, src2)
#define vst3q_p8_ex(src1, src2, align) neon_st3m_q8((__int8*)src1, src2)
#define vst3q_s16_ex(src1, src2, align) neon_st3m_q16((__int16*)src1, src2)
#define vst3q_s32_ex(src1, src2, align) neon_st3m_q32((__int32*)src1, src2)
#define vst3q_s8_ex(src1, src2, align) neon_st3m_q8((__int8*)src1, src2)
#define vst3q_u16_ex(src1, src2, align) neon_st3m_q16((__int16*)src1, src2)
#define vst3q_u32_ex(src1, src2, align) neon_st3m_q32((__int32*)src1, src2)
#define vst3q_u8_ex(src1, src2, align) neon_st3m_q8((__int8*)src1, src2)
#define vst3q_s64_ex(src1, src2, align) neon_st3m_q64((__int64*)src1, src2)
#define vst3q_u64_ex(src1, src2, align) neon_st3m_q64((__int64*)src1, src2)

// vst3_lane_<type>_ex(ptr, val, lane, align) -> neon_st3s_<bits>
#define vst3_lane_f32_ex(src1, src2, src3, align) neon_st3s_32((__int32*)src1, src2, src3)
#define vst3_lane_p16_ex(src1, src2, src3, align) neon_st3s_16((__int16*)src1, src2, src3)
#define vst3_lane_p8_ex(src1, src2, src3, align) neon_st3s_8((__int8*)src1, src2, src3)
#define vst3_lane_s16_ex(src1, src2, src3, align) neon_st3s_16((__int16*)src1, src2, src3)
#define vst3_lane_s32_ex(src1, src2, src3, align) neon_st3s_32((__int32*)src1, src2, src3)
#define vst3_lane_s64_ex(src1, src2, src3, align) neon_st3s_64((__int64*)src1, src2, src3)
#define vst3_lane_s8_ex(src1, src2, src3, align) neon_st3s_8((__int8*)src1, src2, src3)
#define vst3_lane_u16_ex(src1, src2, src3, align) neon_st3s_16((__int16*)src1, src2, src3)
#define vst3_lane_u32_ex(src1, src2, src3, align) neon_st3s_32((__int32*)src1, src2, src3)
#define vst3_lane_u8_ex(src1, src2, src3, align) neon_st3s_8((__int8*)src1, src2, src3)

// vst3q_lane_<type>_ex(ptr, val, lane, align) -> neon_st3s_q<bits>
#define vst3q_lane_f32_ex(src1, src2, src3, align) neon_st3s_q32((__int32*)src1, src2, src3)
#define vst3q_lane_p8_ex(src1, src2, src3, align) neon_st3s_q8((__int8*)src1, src2, src3)
#define vst3q_lane_p16_ex(src1, src2, src3, align) neon_st3s_q16((__int16*)src1, src2, src3)
#define vst3q_lane_s16_ex(src1, src2, src3, align) neon_st3s_q16((__int16*)src1, src2, src3)
#define vst3q_lane_s32_ex(src1, src2, src3, align) neon_st3s_q32((__int32*)src1, src2, src3)
#define vst3q_lane_s64_ex(src1, src2, src3, align) neon_st3s_q64((__int64*)src1, src2, src3)
#define vst3q_lane_u16_ex(src1, src2, src3, align) neon_st3s_q16((__int16*)src1, src2, src3)
#define vst3q_lane_u32_ex(src1, src2, src3, align) neon_st3s_q32((__int32*)src1, src2, src3)

// vst2_<type>_ex(ptr, val, align) -> neon_st2m_<bits>
// The 64-bit D-register forms forward to neon_st1m2_64 instead.
#define vst2_f32_ex(src1, src2, align) neon_st2m_32((__int32*)src1, src2)
#define vst2_p16_ex(src1, src2, align) neon_st2m_16((__int16*)src1, src2)
#define vst2_p8_ex(src1, src2, align) neon_st2m_8((__int8*)src1, src2)
#define vst2_s16_ex(src1, src2, align) neon_st2m_16((__int16*)src1, src2)
#define vst2_s32_ex(src1, src2, align) neon_st2m_32((__int32*)src1, src2)
#define vst2_s8_ex(src1, src2, align) neon_st2m_8((__int8*)src1, src2)
#define vst2_u16_ex(src1, src2, align) neon_st2m_16((__int16*)src1, src2)
#define vst2_u32_ex(src1, src2, align) neon_st2m_32((__int32*)src1, src2)
#define vst2_u8_ex(src1, src2, align) neon_st2m_8((__int8*)src1, src2)
#define vst2_s64_ex(src1, src2, align) neon_st1m2_64((__int64*)src1, src2)
#define vst2_u64_ex(src1, src2, align) neon_st1m2_64((__int64*)src1, src2)

// vst2q_<type>_ex(ptr, val, align) -> neon_st2m_q<bits>
#define vst2q_f32_ex(src1, src2, align) neon_st2m_q32((__int32*)src1, src2)
#define vst2q_p16_ex(src1, src2, align) neon_st2m_q16((__int16*)src1, src2)
#define vst2q_p8_ex(src1, src2, align) neon_st2m_q8((__int8*)src1, src2)
#define vst2q_s16_ex(src1, src2, align) neon_st2m_q16((__int16*)src1, src2)
#define vst2q_s32_ex(src1, src2, align) neon_st2m_q32((__int32*)src1, src2)
#define vst2q_s8_ex(src1, src2, align) neon_st2m_q8((__int8*)src1, src2)
#define vst2q_u16_ex(src1, src2, align) neon_st2m_q16((__int16*)src1, src2)
#define vst2q_u32_ex(src1, src2, align) neon_st2m_q32((__int32*)src1, src2)
#define vst2q_u8_ex(src1, src2, align) neon_st2m_q8((__int8*)src1, src2)
#define vst2q_s64_ex(src1, src2, align) neon_st2m_q64((__int64*)src1, src2)
#define vst2q_u64_ex(src1, src2, align) neon_st2m_q64((__int64*)src1, src2)

// vst2_lane_<type>_ex(ptr, val, lane, align) -> neon_st2s_<bits>
#define vst2_lane_f32_ex(src1, src2, src3, align) neon_st2s_32((__int32*)src1, src2, src3)
#define vst2_lane_p16_ex(src1, src2, src3, align) neon_st2s_16((__int16*)src1, src2, src3)
// vst2_lane_*_ex: forward to neon_st2s_<bits>. The trailing `align` argument
// is accepted but not used in the expansion. In every macro of this group the
// pointer argument is parenthesized before the cast so that an expression
// argument (for example `p + 1` or `c ? a : b`) is cast as a whole.
#define vst2_lane_p8_ex(src1, src2, src3, align) neon_st2s_8((__int8*)(src1), src2, src3)
#define vst2_lane_s16_ex(src1, src2, src3, align) neon_st2s_16((__int16*)(src1), src2, src3)
#define vst2_lane_s32_ex(src1, src2, src3, align) neon_st2s_32((__int32*)(src1), src2, src3)
#define vst2_lane_s64_ex(src1, src2, src3, align) neon_st2s_64((__int64*)(src1), src2, src3)
#define vst2_lane_s8_ex(src1, src2, src3, align) neon_st2s_8((__int8*)(src1), src2, src3)
#define vst2_lane_u16_ex(src1, src2, src3, align) neon_st2s_16((__int16*)(src1), src2, src3)
#define vst2_lane_u32_ex(src1, src2, src3, align) neon_st2s_32((__int32*)(src1), src2, src3)
#define vst2_lane_u8_ex(src1, src2, src3, align) neon_st2s_8((__int8*)(src1), src2, src3)

// vst2q_lane_*_ex: forward to neon_st2s_q<bits>; `align` is not used.
#define vst2q_lane_f32_ex(src1, src2, src3, align) neon_st2s_q32((__int32*)(src1), src2, src3)
#define vst2q_lane_p8_ex(src1, src2, src3, align) neon_st2s_q8((__int8*)(src1), src2, src3)
#define vst2q_lane_p16_ex(src1, src2, src3, align) neon_st2s_q16((__int16*)(src1), src2, src3)
#define vst2q_lane_s16_ex(src1, src2, src3, align) neon_st2s_q16((__int16*)(src1), src2, src3)
#define vst2q_lane_s32_ex(src1, src2, src3, align) neon_st2s_q32((__int32*)(src1), src2, src3)
#define vst2q_lane_s64_ex(src1, src2, src3, align) neon_st2s_q64((__int64*)(src1), src2, src3)
#define vst2q_lane_u16_ex(src1, src2, src3, align) neon_st2s_q16((__int16*)(src1), src2, src3)
#define vst2q_lane_u32_ex(src1, src2, src3, align) neon_st2s_q32((__int32*)(src1), src2, src3)

// vst1_*_ex: forward to neon_st1m_<bits>; `align` is not used.
#define vst1_f32_ex(src1, src2, align) neon_st1m_32((__int32*)(src1), src2)
#define vst1_p16_ex(src1, src2, align) neon_st1m_16((__int16*)(src1), src2)
#define vst1_p8_ex(src1, src2, align) neon_st1m_8((__int8*)(src1), src2)
#define vst1_s16_ex(src1, src2, align) neon_st1m_16((__int16*)(src1), src2)
#define vst1_s32_ex(src1, src2, align) neon_st1m_32((__int32*)(src1), src2)
#define vst1_s8_ex(src1, src2, align) neon_st1m_8((__int8*)(src1), src2)
#define vst1_u16_ex(src1, src2, align) neon_st1m_16((__int16*)(src1), src2)
#define vst1_u32_ex(src1, src2, align) neon_st1m_32((__int32*)(src1), src2)
#define vst1_u8_ex(src1, src2, align) neon_st1m_8((__int8*)(src1), src2)
#define vst1_s64_ex(src1, src2, align) neon_st1m_64((__int64*)(src1), src2)
#define vst1_u64_ex(src1, src2, align) neon_st1m_64((__int64*)(src1), src2)

// vst1q_*_ex: forward to neon_st1m_q<bits>; `align` is not used.
#define vst1q_f32_ex(src1, src2, align) neon_st1m_q32((__int32*)(src1), src2)
#define vst1q_p16_ex(src1, src2, align) neon_st1m_q16((__int16*)(src1), src2)
#define vst1q_p8_ex(src1, src2, align) neon_st1m_q8((__int8*)(src1), src2)
#define vst1q_s16_ex(src1, src2, align) neon_st1m_q16((__int16*)(src1), src2)
#define vst1q_s32_ex(src1, src2, align) neon_st1m_q32((__int32*)(src1), src2)
#define vst1q_s8_ex(src1, src2, align) neon_st1m_q8((__int8*)(src1), src2)
#define vst1q_u16_ex(src1, src2, align) neon_st1m_q16((__int16*)(src1), src2)
#define vst1q_u32_ex(src1, src2, align) neon_st1m_q32((__int32*)(src1), src2)
#define vst1q_u8_ex(src1, src2, align) neon_st1m_q8((__int8*)(src1), src2)
#define vst1q_s64_ex(src1, src2, align) neon_st1m_q64((__int64*)(src1), src2)
#define vst1q_u64_ex(src1, src2, align) neon_st1m_q64((__int64*)(src1), src2)

// vst1_*_x2_ex: forward to neon_st1m2_<bits>; `align` is not used.
#define vst1_f32_x2_ex(src1, src2, align) neon_st1m2_32((__int32*)(src1), src2)
#define vst1_p16_x2_ex(src1, src2, align) neon_st1m2_16((__int16*)(src1), src2)
#define vst1_p8_x2_ex(src1, src2, align) neon_st1m2_8((__int8*)(src1), src2)
#define vst1_s16_x2_ex(src1, src2, align) neon_st1m2_16((__int16*)(src1), src2)
#define vst1_s32_x2_ex(src1, src2, align) neon_st1m2_32((__int32*)(src1), src2)
#define vst1_s8_x2_ex(src1, src2, align) neon_st1m2_8((__int8*)(src1), src2)
#define vst1_u16_x2_ex(src1, src2, align) neon_st1m2_16((__int16*)(src1), src2)
#define vst1_u32_x2_ex(src1, src2, align) neon_st1m2_32((__int32*)(src1), src2)
#define vst1_u8_x2_ex(src1, src2, align) neon_st1m2_8((__int8*)(src1), src2)
#define vst1_s64_x2_ex(src1, src2, align) neon_st1m2_64((__int64*)(src1), src2)
#define vst1_u64_x2_ex(src1, src2, align) neon_st1m2_64((__int64*)(src1), src2)

// vst1q_*_x2_ex: forward to neon_st1m2_q<bits>; `align` is not used.
#define vst1q_f32_x2_ex(src1, src2, align) neon_st1m2_q32((__int32*)(src1), src2)
#define vst1q_p16_x2_ex(src1, src2, align) neon_st1m2_q16((__int16*)(src1), src2)
#define vst1q_p8_x2_ex(src1, src2, align) neon_st1m2_q8((__int8*)(src1), src2)
#define vst1q_s16_x2_ex(src1, src2, align) neon_st1m2_q16((__int16*)(src1), src2)
#define vst1q_s32_x2_ex(src1, src2, align) neon_st1m2_q32((__int32*)(src1), src2)
#define vst1q_s8_x2_ex(src1, src2, align) neon_st1m2_q8((__int8*)(src1), src2)
#define vst1q_u16_x2_ex(src1, src2, align) neon_st1m2_q16((__int16*)(src1), src2)
#define vst1q_u32_x2_ex(src1, src2, align) neon_st1m2_q32((__int32*)(src1), src2)
#define vst1q_u8_x2_ex(src1, src2, align) neon_st1m2_q8((__int8*)(src1), src2)
#define vst1q_s64_x2_ex(src1, src2, align) neon_st1m2_q64((__int64*)(src1), src2)
#define vst1q_u64_x2_ex(src1, src2, align) neon_st1m2_q64((__int64*)(src1), src2)

// vst1_*_x3_ex: forward to neon_st1m3_<bits>; `align` is not used.
#define vst1_f32_x3_ex(src1, src2, align) neon_st1m3_32((__int32*)(src1), src2)
#define vst1_p16_x3_ex(src1, src2, align) neon_st1m3_16((__int16*)(src1), src2)
#define vst1_p8_x3_ex(src1, src2, align) neon_st1m3_8((__int8*)(src1), src2)
#define vst1_s16_x3_ex(src1, src2, align) neon_st1m3_16((__int16*)(src1), src2)
#define vst1_s32_x3_ex(src1, src2, align) neon_st1m3_32((__int32*)(src1), src2)
#define vst1_s8_x3_ex(src1, src2, align) neon_st1m3_8((__int8*)(src1), src2)
#define vst1_u16_x3_ex(src1, src2, align) neon_st1m3_16((__int16*)(src1), src2)
#define vst1_u32_x3_ex(src1, src2, align) neon_st1m3_32((__int32*)(src1), src2)
#define vst1_u8_x3_ex(src1, src2, align) neon_st1m3_8((__int8*)(src1), src2)
#define vst1_s64_x3_ex(src1, src2, align) neon_st1m3_64((__int64*)(src1), src2)
#define vst1_u64_x3_ex(src1, src2, align) neon_st1m3_64((__int64*)(src1), src2)

// vst1q_*_x3_ex (first entries): forward to neon_st1m3_q<bits>; `align` is not used.
#define vst1q_f32_x3_ex(src1, src2, align) neon_st1m3_q32((__int32*)(src1), src2)
#define vst1q_p16_x3_ex(src1, src2, align) neon_st1m3_q16((__int16*)(src1), src2)
#define vst1q_p8_x3_ex(src1, src2, align) neon_st1m3_q8((__int8*)(src1), src2)
#define vst1q_s16_x3_ex(src1, src2, align) neon_st1m3_q16((__int16*)(src1), src2)
#define vst1q_s32_x3_ex(src1, src2, align) neon_st1m3_q32((__int32*)src1, src2) #define vst1q_s8_x3_ex(src1, src2, align) neon_st1m3_q8((__int8*)src1, src2) #define vst1q_u16_x3_ex(src1, src2, align) neon_st1m3_q16((__int16*)src1, src2) #define vst1q_u32_x3_ex(src1, src2, align) neon_st1m3_q32((__int32*)src1, src2) #define vst1q_u8_x3_ex(src1, src2, align) neon_st1m3_q8((__int8*)src1, src2) #define vst1q_s64_x3_ex(src1, src2, align) neon_st1m3_q64((__int64*)src1, src2) #define vst1q_u64_x3_ex(src1, src2, align) neon_st1m3_q64((__int64*)src1, src2) #define vst1_f32_x4_ex(src1, src2, align) neon_st1m4_32((__int32*)src1, src2) #define vst1_p16_x4_ex(src1, src2, align) neon_st1m4_16((__int16*)src1, src2) #define vst1_p8_x4_ex(src1, src2, align) neon_st1m4_8((__int8*)src1, src2) #define vst1_s16_x4_ex(src1, src2, align) neon_st1m4_16((__int16*)src1, src2) #define vst1_s32_x4_ex(src1, src2, align) neon_st1m4_32((__int32*)src1, src2) #define vst1_s8_x4_ex(src1, src2, align) neon_st1m4_8((__int8*)src1, src2) #define vst1_u16_x4_ex(src1, src2, align) neon_st1m4_16((__int16*)src1, src2) #define vst1_u32_x4_ex(src1, src2, align) neon_st1m4_32((__int32*)src1, src2) #define vst1_u8_x4_ex(src1, src2, align) neon_st1m4_8((__int8*)src1, src2) #define vst1_s64_x4_ex(src1, src2, align) neon_st1m4_64((__int64*)src1, src2) #define vst1_u64_x4_ex(src1, src2, align) neon_st1m4_64((__int64*)src1, src2) #define vst1q_f32_x4_ex(src1, src2, align) neon_st1m4_q32((__int32*)src1, src2) #define vst1q_p16_x4_ex(src1, src2, align) neon_st1m4_q16((__int16*)src1, src2) #define vst1q_p8_x4_ex(src1, src2, align) neon_st1m4_q8((__int8*)src1, src2) #define vst1q_s16_x4_ex(src1, src2, align) neon_st1m4_q16((__int16*)src1, src2) #define vst1q_s32_x4_ex(src1, src2, align) neon_st1m4_q32((__int32*)src1, src2) #define vst1q_s8_x4_ex(src1, src2, align) neon_st1m4_q8((__int8*)src1, src2) #define vst1q_u16_x4_ex(src1, src2, align) neon_st1m4_q16((__int16*)src1, src2) #define vst1q_u32_x4_ex(src1, src2, align) 
neon_st1m4_q32((__int32*)src1, src2) #define vst1q_u8_x4_ex(src1, src2, align) neon_st1m4_q8((__int8*)src1, src2) #define vst1q_s64_x4_ex(src1, src2, align) neon_st1m4_q64((__int64*)src1, src2) #define vst1q_u64_x4_ex(src1, src2, align) neon_st1m4_q64((__int64*)src1, src2) #define vst1_lane_f32_ex(src1, src2, src3, align) neon_st1s_32((__int32*)src1, src2, src3) #define vst1_lane_p16_ex(src1, src2, src3, align) neon_st1s_16((__int16*)src1, src2, src3) #define vst1_lane_p8_ex(src1, src2, src3, align) neon_st1s_8((__int8*)src1, src2, src3) #define vst1_lane_s16_ex(src1, src2, src3, align) neon_st1s_16((__int16*)src1, src2, src3) #define vst1_lane_s32_ex(src1, src2, src3, align) neon_st1s_32((__int32*)src1, src2, src3) #define vst1_lane_s64_ex(src1, src2, src3, align) neon_st1s_64((__int64*)src1, src2, src3) #define vst1_lane_s8_ex(src1, src2, src3, align) neon_st1s_8((__int8*)src1, src2, src3) #define vst1_lane_u16_ex(src1, src2, src3, align) neon_st1s_16((__int16*)src1, src2, src3) #define vst1_lane_u32_ex(src1, src2, src3, align) neon_st1s_32((__int32*)src1, src2, src3) #define vst1_lane_u8_ex(src1, src2, src3, align) neon_st1s_8((__int8*)src1, src2, src3) #define vst1q_lane_f32_ex(src1, src2, src3, align) neon_st1s_q32((__int32*)src1, src2, src3) #define vst1q_lane_p8_ex(src1, src2, src3, align) neon_st1s_q8((__int8*)src1, src2, src3) #define vst1q_lane_p16_ex(src1, src2, src3, align) neon_st1s_q16((__int16*)src1, src2, src3) #define vst1q_lane_s8_ex(src1, src2, src3, align) neon_st1s_q8((__int16*)src1, src2, src3) #define vst1q_lane_s16_ex(src1, src2, src3, align) neon_st1s_q16((__int16*)src1, src2, src3) #define vst1q_lane_s32_ex(src1, src2, src3, align) neon_st1s_q32((__int32*)src1, src2, src3) #define vst1q_lane_s64_ex(src1, src2, src3, align) neon_st1s_q64((__int64*)src1, src2, src3) #define vst1q_lane_u8_ex(src1, src2, src3, align) neon_st1s_q8((__int8*)src1, src2, src3) #define vst1q_lane_u16_ex(src1, src2, src3, align) neon_st1s_q16((__int16*)src1, src2, 
src3) #define vst1q_lane_u32_ex(src1, src2, src3, align) neon_st1s_q32((__int32*)src1, src2, src3) // FCVTL/FCVTL2/FCVTN/FCVTN2/FCVTXN/FCVTXN2 __n128 neon_fcvtl_32(__n64); __n128 neon_fcvtl2_32(__n128); __n128 neon_fcvtl_64(__n64); __n128 neon_fcvtl2_64(__n128); __n64 neon_fcvtn_32(__n128); __n128 neon_fcvtn2_32(__n64, __n128); __n64 neon_fcvtn_64(__n128); __n128 neon_fcvtn2_64(__n64, __n128); __n64 neon_fcvtxn_64(__n128); __n128 neon_fcvtxn2_64(__n64, __n128); float neon_fcvtxns_64(double); #define vcvt_f32_f16(src) neon_fcvtl_32(src) #define vcvt_high_f32_f16(src) neon_fcvtl2_32(src) #define vcvt_f64_f32(src) neon_fcvtl_64(src) #define vcvt_high_f64_f32(src) neon_fcvtl2_64(src) #define vcvt_f16_f32(src) neon_fcvtn_32(src) #define vcvt_high_f16_f32(src1, src2) neon_fcvtn2_32(src1, src2) #define vcvt_f32_f64(src) neon_fcvtn_64(src) #define vcvt_high_f32_f64(src1, src2) neon_fcvtn2_64(src1, src2) #define vcvtx_f32_f64(src) neon_fcvtxn_64(src) #define vcvtx_high_f32_f64(src1, src2) neon_fcvtxn2_64(src1, src2) #define vcvtxd_f32_f64(src) neon_fcvtxns_64(src) // SQXTN/SQXTUN/UQXTN/XTN __n64 neon_sqxtn_16(__n128); __n64 neon_sqxtn_32(__n128); __n64 neon_sqxtn_64(__n128); __n128 neon_sqxtn2_16(__n64, __n128); __n128 neon_sqxtn2_32(__n64, __n128); __n128 neon_sqxtn2_64(__n64, __n128); __n8 neon_sqxtns_16(__n16); __n16 neon_sqxtns_32(float); float neon_sqxtns_64(__n64); __n64 neon_sqxtun_16(__n128); __n64 neon_sqxtun_32(__n128); __n64 neon_sqxtun_64(__n128); __n128 neon_sqxtun2_16(__n64, __n128); __n128 neon_sqxtun2_32(__n64, __n128); __n128 neon_sqxtun2_64(__n64, __n128); __n8 neon_sqxtuns_16(__n16); __n16 neon_sqxtuns_32(float); float neon_sqxtuns_64(__n64); __n64 neon_uqxtn_16(__n128); __n64 neon_uqxtn_32(__n128); __n64 neon_uqxtn_64(__n128); __n128 neon_uqxtn2_16(__n64, __n128); __n128 neon_uqxtn2_32(__n64, __n128); __n128 neon_uqxtn2_64(__n64, __n128); __n8 neon_uqxtns_16(__n16); __n16 neon_uqxtns_32(float); float neon_uqxtns_64(__n64); __n64 neon_xtn_16(__n128); 
__n64 neon_xtn_32(__n128); __n64 neon_xtn_64(__n128); __n128 neon_xtn2_16(__n64, __n128); __n128 neon_xtn2_32(__n64, __n128); __n128 neon_xtn2_64(__n64, __n128); #define vqmovn_s16(src) neon_sqxtn_16(src) #define vqmovn_s32(src) neon_sqxtn_32(src) #define vqmovn_s64(src) neon_sqxtn_64(src) #define vqmovn_high_s16(src1, src2) neon_sqxtn2_16(src1, src2) #define vqmovn_high_s32(src1, src2) neon_sqxtn2_32(src1, src2) #define vqmovn_high_s64(src1, src2) neon_sqxtn2_64(src1, src2) #define vqmovnh_s16(src) neon_sqxtns_16(src) #define vqmovns_s32(src) neon_sqxtns_32(src) #define vqmovnd_s64(src) neon_sqxtns_64(src) #define vqmovun_s16(src) neon_sqxtun_16(src) #define vqmovun_s32(src) neon_sqxtun_32(src) #define vqmovun_s64(src) neon_sqxtun_64(src) #define vqmovun_high_s16(src1, src2) neon_sqxtun2_16(src1, src2) #define vqmovun_high_s32(src1, src2) neon_sqxtun2_32(src1, src2) #define vqmovun_high_s64(src1, src2) neon_sqxtun2_64(src1, src2) #define vqmovunh_s16(src) neon_sqxtuns_16(src) #define vqmovuns_s32(src) neon_sqxtuns_32(src) #define vqmovund_s64(src) neon_sqxtuns_64(src) #define vqmovn_u16(src) neon_uqxtn_16(src) #define vqmovn_u32(src) neon_uqxtn_32(src) #define vqmovn_u64(src) neon_uqxtn_64(src) #define vqmovn_high_u16(src1, src2) neon_uqxtn2_16(src1, src2) #define vqmovn_high_u32(src1, src2) neon_uqxtn2_32(src1, src2) #define vqmovn_high_u64(src1, src2) neon_uqxtn2_64(src1, src2) #define vqmovnh_u16(src) neon_uqxtns_16(src) #define vqmovns_u32(src) neon_uqxtns_32(src) #define vqmovnd_u64(src) neon_uqxtns_64(src) #define vmovn_s16(src) neon_xtn_16(src) #define vmovn_s32(src) neon_xtn_32(src) #define vmovn_s64(src) neon_xtn_64(src) #define vmovn_u16(src) neon_xtn_16(src) #define vmovn_u32(src) neon_xtn_32(src) #define vmovn_u64(src) neon_xtn_64(src) // SHLL/SSHLL/USHLL __n128 neon_sshll_8 (__n64, const int); __n128 neon_sshll2_8 (__n128, const int); __n128 neon_sshll_16 (__n64, const int); __n128 neon_sshll2_16(__n128, const int); __n128 neon_sshll_32 (__n64, const 
int); __n128 neon_sshll2_32(__n128, const int); __n128 neon_ushll_8 (__n64, const int); __n128 neon_ushll2_8 (__n128, const int); __n128 neon_ushll_16 (__n64, const int); __n128 neon_ushll2_16(__n128, const int); __n128 neon_ushll_32 (__n64, const int); __n128 neon_ushll2_32(__n128, const int); #define vshll_n_s8(src1, src2) neon_sshll_8(src1, src2) #define vshll_n_s16(src1, src2) neon_sshll_16(src1, src2) #define vshll_n_s32(src1, src2) neon_sshll_32(src1, src2) #define vshll_n_u8(src1, src2) neon_ushll_8(src1, src2) #define vshll_n_u16(src1, src2) neon_ushll_16(src1, src2) #define vshll_n_u32(src1, src2) neon_ushll2_32(src1, src2) #define vshll_high_n_s8(src1, src2) neon_sshll2_8(src1, src2) #define vshll_high_n_s16(src1, src2) neon_sshll2_16(src1, src2) #define vshll_high_n_s32(src1, src2) neon_sshll2_32(src1, src2) #define vshll_high_n_u8(src1, src2) neon_ushll2_8(src1, src2) #define vshll_high_n_u16(src1, src2) neon_ushll2_16(src1, src2) #define vshll_high_n_u32(src1, src2) neon_ushll2_32(src1, src2) #define vmovl_s8(src1) neon_sshll_8(src1, 0) #define vmovl_s16(src1) neon_sshll_16(src1, 0) #define vmovl_s32(src1) neon_sshll_32(src1, 0) #define vmovl_u8(src1) neon_ushll_8(src1, 0) #define vmovl_u16(src1) neon_ushll_16(src1, 0) #define vmovl_u32(src1) neon_ushll2_32(src1, 0) #define vmovl_high_s8(src1) neon_sshll2_8(src1, 0) #define vmovl_high_s16(src1) neon_sshll2_16(src1, 0) #define vmovl_high_s32(src1) neon_sshll2_32(src1, 0) #define vmovl_high_u8(src1) neon_ushll2_8(src1, 0) #define vmovl_high_u16(src1) neon_ushll2_16(src1, 0) #define vmovl_high_u32(src1) neon_ushll2_32(src1, 0) // SHRN/RSHRN/SQSHRN/SQRSHRN/UQSHRN/UQRSHRN/SQSHRUN/SQRSHRUN __n64 neon_shrn_16 (__n128, const int); __n128 neon_shrn2_16 (__n64, __n128, const int); __n64 neon_shrn_32 (__n128, const int); __n128 neon_shrn2_32 (__n64, __n128, const int); __n64 neon_shrn_64 (__n128, const int); __n128 neon_shrn2_64 (__n64, __n128, const int); __n64 neon_rshrn_16 (__n128, const int); __n128 
neon_rshrn2_16 (__n64, __n128, const int); __n64 neon_rshrn_32 (__n128, const int); __n128 neon_rshrn2_32 (__n64, __n128, const int); __n64 neon_rshrn_64 (__n128, const int); __n128 neon_rshrn2_64 (__n64, __n128, const int); __n64 neon_sqshrn_16 (__n128, const int); __n128 neon_sqshrn2_16 (__n64, __n128, const int); __n64 neon_sqshrn_32 (__n128, const int); __n128 neon_sqshrn2_32 (__n64, __n128, const int); __n64 neon_sqshrn_64 (__n128, const int); __n128 neon_sqshrn2_64 (__n64, __n128, const int); __n64 neon_sqrshrn_16 (__n128, const int); __n128 neon_sqrshrn2_16 (__n64, __n128, const int); __n64 neon_sqrshrn_32 (__n128, const int); __n128 neon_sqrshrn2_32 (__n64, __n128, const int); __n64 neon_sqrshrn_64 (__n128, const int); __n128 neon_sqrshrn2_64 (__n64, __n128, const int); __n64 neon_uqshrn_16 (__n128, const int); __n128 neon_uqshrn2_16 (__n64, __n128, const int); __n64 neon_uqshrn_32 (__n128, const int); __n128 neon_uqshrn2_32 (__n64, __n128, const int); __n64 neon_uqshrn_64 (__n128, const int); __n128 neon_uqshrn2_64 (__n64, __n128, const int); __n64 neon_uqrshrn_16 (__n128, const int); __n128 neon_uqrshrn2_16 (__n64, __n128, const int); __n64 neon_uqrshrn_32 (__n128, const int); __n128 neon_uqrshrn2_32 (__n64, __n128, const int); __n64 neon_uqrshrn_64 (__n128, const int); __n128 neon_uqrshrn2_64 (__n64, __n128, const int); __n64 neon_sqshrun_16 (__n128, const int); __n128 neon_sqshrun2_16 (__n64, __n128, const int); __n64 neon_sqshrun_32 (__n128, const int); __n128 neon_sqshrun2_32 (__n64, __n128, const int); __n64 neon_sqshrun_64 (__n128, const int); __n128 neon_sqshrun2_64 (__n64, __n128, const int); __n64 neon_sqrshrun_16 (__n128, const int); __n128 neon_sqrshrun2_16(__n64, __n128, const int); __n64 neon_sqrshrun_32 (__n128, const int); __n128 neon_sqrshrun2_32(__n64, __n128, const int); __n64 neon_sqrshrun_64 (__n128, const int); __n128 neon_sqrshrun2_64(__n64, __n128, const int); __n8 neon_sqshrn_s16 (__n16, const int); __n16 neon_sqshrn_s32 (float, 
const int); float neon_sqshrn_s64 (__n64, const int); __n8 neon_sqrshrn_s16 (__n16, const int); __n16 neon_sqrshrn_s32 (float, const int); float neon_sqrshrn_s64 (__n64, const int); __n8 neon_uqshrn_s16 (__n16, const int); __n16 neon_uqshrn_s32 (float, const int); float neon_uqshrn_s64 (__n64, const int); __n8 neon_uqrshrn_s16 (__n16, const int); __n16 neon_uqrshrn_s32 (float, const int); float neon_uqrshrn_s64 (__n64, const int); __n8 neon_sqshrun_s16 (__n16, const int); __n16 neon_sqshrun_s32 (float, const int); float neon_sqshrun_s64 (__n64, const int); __n8 neon_sqrshrun_s16(__n16, const int); __n16 neon_sqrshrun_s32(float, const int); float neon_sqrshrun_s64(__n64, const int); #define vshrn_n_s16(src1, src2) neon_shrn_16(src1, src2) #define vshrn_n_s32(src1, src2) neon_shrn_32(src1, src2) #define vshrn_n_s64(src1, src2) neon_shrn_64(src1, src2) #define vshrn_n_u16(src1, src2) neon_shrn_16(src1, src2) #define vshrn_n_u32(src1, src2) neon_shrn_32(src1, src2) #define vshrn_n_u64(src1, src2) neon_shrn_64(src1, src2) #define vshrn_high_n_s16(src1, src2, src3) neon_shrn2_16(src1, src2, src3) #define vshrn_high_n_s32(src1, src2, src3) neon_shrn2_32(src1, src2, src3) #define vshrn_high_n_s64(src1, src2, src3) neon_shrn2_64(src1, src2, src3) #define vshrn_high_n_u16(src1, src2, src3) neon_shrn2_16(src1, src2, src3) #define vshrn_high_n_u32(src1, src2, src3) neon_shrn2_32(src1, src2, src3) #define vshrn_high_n_u64(src1, src2, src3) neon_shrn2_64(src1, src2, src3) #define vrshrn_n_s16(src1, src2) neon_rshrn_16(src1, src2) #define vrshrn_n_s32(src1, src2) neon_rshrn_32(src1, src2) #define vrshrn_n_s64(src1, src2) neon_rshrn_64(src1, src2) #define vrshrn_n_u16(src1, src2) neon_rshrn_16(src1, src2) #define vrshrn_n_u32(src1, src2) neon_rshrn_32(src1, src2) #define vrshrn_n_u64(src1, src2) neon_rshrn_64(src1, src2) #define vrshrn_high_n_s16(src1, src2, src3) neon_rshrn2_16(src1, src2, src3) #define vrshrn_high_n_s32(src1, src2, src3) neon_rshrn2_32(src1, src2, src3) #define 
vrshrn_high_n_s64(src1, src2, src3) neon_rshrn2_64(src1, src2, src3) #define vrshrn_high_n_u16(src1, src2, src3) neon_rshrn2_16(src1, src2, src3) #define vrshrn_high_n_u32(src1, src2, src3) neon_rshrn2_32(src1, src2, src3) #define vrshrn_high_n_u64(src1, src2, src3) neon_rshrn2_64(src1, src2, src3) #define vqshrn_n_s16(src1, src2) neon_sqshrn_16(src1, src2) #define vqshrn_n_s32(src1, src2) neon_sqshrn_32(src1, src2) #define vqshrn_n_s64(src1, src2) neon_sqshrn_64(src1, src2) #define vqshrn_n_u16(src1, src2) neon_uqshrn_16(src1, src2) #define vqshrn_n_u32(src1, src2) neon_uqshrn_32(src1, src2) #define vqshrn_n_u64(src1, src2) neon_uqshrn_64(src1, src2) #define vqshrn_high_n_s16(src1, src2, src3) neon_sqshrn2_16(src1, src2, src3) #define vqshrn_high_n_s32(src1, src2, src3) neon_sqshrn2_32(src1, src2, src3) #define vqshrn_high_n_s64(src1, src2, src3) neon_sqshrn2_64(src1, src2, src3) #define vqshrn_high_n_u16(src1, src2, src3) neon_uqshrn2_16(src1, src2, src3) #define vqshrn_high_n_u32(src1, src2, src3) neon_uqshrn2_32(src1, src2, src3) #define vqshrn_high_n_u64(src1, src2, src3) neon_uqshrn2_64(src1, src2, src3) #define vqrshrn_n_s16(src1, src2) neon_sqrshrn_16(src1, src2) #define vqrshrn_n_s32(src1, src2) neon_sqrshrn_32(src1, src2) #define vqrshrn_n_s64(src1, src2) neon_sqrshrn_64(src1, src2) #define vqrshrn_n_u16(src1, src2) neon_uqrshrn_16(src1, src2) #define vqrshrn_n_u32(src1, src2) neon_uqrshrn_32(src1, src2) #define vqrshrn_n_u64(src1, src2) neon_uqrshrn_64(src1, src2) #define vqrshrn_high_n_s16(src1, src2, src3) neon_sqrshrn2_16(src1, src2, src3) #define vqrshrn_high_n_s32(src1, src2, src3) neon_sqrshrn2_32(src1, src2, src3) #define vqrshrn_high_n_s64(src1, src2, src3) neon_sqrshrn2_64(src1, src2, src3) #define vqrshrn_high_n_u16(src1, src2, src3) neon_uqrshrn2_16(src1, src2, src3) #define vqrshrn_high_n_u32(src1, src2, src3) neon_uqrshrn2_32(src1, src2, src3) #define vqrshrn_high_n_u64(src1, src2, src3) neon_uqrshrn2_64(src1, src2, src3) #define 
vqshrun_n_s16(src1, src2) neon_sqshrun_16(src1, src2) #define vqshrun_n_s32(src1, src2) neon_sqshrun_32(src1, src2) #define vqshrun_n_s64(src1, src2) neon_sqshrun_64(src1, src2) #define vqshrun_high_n_s16(src1, src2, src3) neon_sqshrun2_16(src1, src2, src3) #define vqshrun_high_n_s32(src1, src2, src3) neon_sqshrun2_32(src1, src2, src3) #define vqshrun_high_n_s64(src1, src2, src3) neon_sqshrun2_64(src1, src2, src3) #define vqrshrun_n_s16(src1, src2) neon_sqrshrun_16(src1, src2) #define vqrshrun_n_s32(src1, src2) neon_sqrshrun_32(src1, src2) #define vqrshrun_n_s64(src1, src2) neon_sqrshrun_64(src1, src2) #define vqrshrun_high_n_s16(src1, src2, src3) neon_sqrshrun2_16(src1, src2, src3) #define vqrshrun_high_n_s32(src1, src2, src3) neon_sqrshrun2_32(src1, src2, src3) #define vqrshrun_high_n_s64(src1, src2, src3) neon_sqrshrun2_64(src1, src2, src3) #define vqshrnh_n_s16(src1, src2) neon_sqshrn_s16(src1, src2) #define vqshrnh_n_s32(src1, src2) neon_sqshrn_s32(src1, src2) #define vqshrnh_n_s64(src1, src2) neon_sqshrn_s64(src1, src2) #define vqrshrnh_n_s16(src1, src2) neon_sqrshrn_s16(src1, src2) #define vqrshrnh_n_s32(src1, src2) neon_sqrshrn_s32(src1, src2) #define vqrshrnh_n_s64(src1, src2) neon_sqrshrn_s64(src1, src2) #define vqshrunh_n_s16(src1, src2) neon_sqshrun_s16(src1, src2) #define vqshrunh_n_s32(src1, src2) neon_sqshrun_s32(src1, src2) #define vqshrunh_n_s64(src1, src2) neon_sqshrun_s64(src1, src2) #define vqrshrunh_n_s16(src1, src2) neon_sqrshrun_s16(src1, src2) #define vqrshrunh_n_s32(src1, src2) neon_sqrshrun_s32(src1, src2) #define vqrshrunh_n_s64(src1, src2) neon_sqrshrun_s64(src1, src2) // ADDHN/RADDHN/SADDW/UADDW/SADDL/UADDL __n64 neon_addhn_16 (__n128, __n128); __n128 neon_addhn2_16 (__n64, __n128, __n128); __n64 neon_addhn_32 (__n128, __n128); __n128 neon_addhn2_32 (__n64, __n128, __n128); __n64 neon_addhn_64 (__n128, __n128); __n128 neon_addhn2_64 (__n64, __n128, __n128); __n64 neon_raddhn_16 (__n128, __n128); __n128 neon_raddhn2_16 (__n64, __n128, 
__n128); __n64 neon_raddhn_32 (__n128, __n128); __n128 neon_raddhn2_32 (__n64, __n128, __n128); __n64 neon_raddhn_64 (__n128, __n128); __n128 neon_raddhn2_64 (__n64, __n128, __n128); __n128 neon_saddw_8 (__n128, __n64); __n128 neon_saddw2_8 (__n128, __n128); __n128 neon_saddw_16 (__n128, __n64); __n128 neon_saddw2_16 (__n128, __n128); __n128 neon_saddw_32 (__n128, __n64); __n128 neon_saddw2_32 (__n128, __n128); __n128 neon_uaddw_8 (__n128, __n64); __n128 neon_uaddw2_8 (__n128, __n128); __n128 neon_uaddw_16 (__n128, __n64); __n128 neon_uaddw2_16 (__n128, __n128); __n128 neon_uaddw_32 (__n128, __n64); __n128 neon_uaddw2_32 (__n128, __n128); __n128 neon_saddl_8 (__n64, __n64); __n128 neon_saddl2_8 (__n128, __n128); __n128 neon_saddl_16 (__n64, __n64); __n128 neon_saddl2_16 (__n128, __n128); __n128 neon_saddl_32 (__n64, __n64); __n128 neon_saddl2_32 (__n128, __n128); __n128 neon_uaddl_8 (__n64, __n64); __n128 neon_uaddl2_8 (__n128, __n128); __n128 neon_uaddl_16 (__n64, __n64); __n128 neon_uaddl2_16 (__n128, __n128); __n128 neon_uaddl_32 (__n64, __n64); __n128 neon_uaddl2_32 (__n128, __n128); #define vaddhn_s16(src1, src2) neon_addhn_16(src1, src2) #define vaddhn_s32(src1, src2) neon_addhn_32(src1, src2) #define vaddhn_s64(src1, src2) neon_addhn_64(src1, src2) #define vaddhn_u16(src1, src2) neon_addhn_16(src1, src2) #define vaddhn_u32(src1, src2) neon_addhn_32(src1, src2) #define vaddhn_u64(src1, src2) neon_addhn_64(src1, src2) #define vaddhn_high_s16(src1, src2, src3) neon_addhn2_16(src1, src2, src3) #define vaddhn_high_s32(src1, src2, src3) neon_addhn2_32(src1, src2, src3) #define vaddhn_high_s64(src1, src2, src3) neon_addhn2_64(src1, src2, src3) #define vaddhn_high_u16(src1, src2, src3) neon_addhn2_16(src1, src2, src3) #define vaddhn_high_u32(src1, src2, src3) neon_addhn2_32(src1, src2, src3) #define vaddhn_high_u64(src1, src2, src3) neon_addhn2_64(src1, src2, src3) #define vraddhn_s16(src1, src2) neon_raddhn_16(src1, src2) #define vraddhn_s32(src1, src2) 
neon_raddhn_32(src1, src2) #define vraddhn_s64(src1, src2) neon_raddhn_64(src1, src2) #define vraddhn_u16(src1, src2) neon_raddhn_16(src1, src2) #define vraddhn_u32(src1, src2) neon_raddhn_32(src1, src2) #define vraddhn_u64(src1, src2) neon_raddhn_64(src1, src2) #define vraddhn_high_s16(src1, src2, src3) neon_raddhn2_16(src1, src2, src3) #define vraddhn_high_s32(src1, src2, src3) neon_raddhn2_32(src1, src2, src3) #define vraddhn_high_s64(src1, src2, src3) neon_raddhn2_64(src1, src2, src3) #define vraddhn_high_u16(src1, src2, src3) neon_raddhn2_16(src1, src2, src3) #define vraddhn_high_u32(src1, src2, src3) neon_raddhn2_32(src1, src2, src3) #define vraddhn_high_u64(src1, src2, src3) neon_raddhn2_64(src1, src2, src3) #define vaddw_s8(src1, src2) neon_saddw_8(src1, src2) #define vaddw_s16(src1, src2) neon_saddw_16(src1, src2) #define vaddw_s32(src1, src2) neon_saddw_32(src1, src2) #define vaddw_u8(src1, src2) neon_uaddw_8(src1, src2) #define vaddw_u16(src1, src2) neon_uaddw_16(src1, src2) #define vaddw_u32(src1, src2) neon_uaddw_32(src1, src2) #define vaddl_s8(src1, src2) neon_saddl_8(src1, src2) #define vaddl_s16(src1, src2) neon_saddl_16(src1, src2) #define vaddl_s32(src1, src2) neon_saddl_32(src1, src2) #define vaddl_u8(src1, src2) neon_uaddl_8(src1, src2) #define vaddl_u16(src1, src2) neon_uaddl_16(src1, src2) #define vaddl_u32(src1, src2) neon_uaddl_32(src1, src2) #define vaddw_high_s8(src1, src2) neon_saddw2_8(src1, src2) #define vaddw_high_s16(src1, src2) neon_saddw2_16(src1, src2) #define vaddw_high_s32(src1, src2) neon_saddw2_32(src1, src2) #define vaddw_high_u8(src1, src2) neon_uaddw2_8(src1, src2) #define vaddw_high_u16(src1, src2) neon_uaddw2_16(src1, src2) #define vaddw_high_u32(src1, src2) neon_uaddw2_32(src1, src2) #define vaddl_high_s8(src1, src2) neon_saddl2_8(src1, src2) #define vaddl_high_s16(src1, src2) neon_saddl2_16(src1, src2) #define vaddl_high_s32(src1, src2) neon_saddl2_32(src1, src2) #define vaddl_high_u8(src1, src2) neon_uaddl2_8(src1, 
src2) #define vaddl_high_u16(src1, src2) neon_uaddl2_16(src1, src2) #define vaddl_high_u32(src1, src2) neon_uaddl2_32(src1, src2) // SUBHN/RSUBHN/SSUBW/USUBW/SSUBL/USUBL __n64 neon_subhn_16(__n128, __n128); __n128 neon_subhn2_16(__n64, __n128, __n128); __n64 neon_subhn_32(__n128, __n128); __n128 neon_subhn2_32(__n64, __n128, __n128); __n64 neon_subhn_64(__n128, __n128); __n128 neon_subhn2_64(__n64, __n128, __n128); __n64 neon_rsubhn_16(__n128, __n128); __n128 neon_rsubhn2_16(__n64, __n128, __n128); __n64 neon_rsubhn_32(__n128, __n128); __n128 neon_rsubhn2_32(__n64, __n128, __n128); __n64 neon_rsubhn_64(__n128, __n128); __n128 neon_rsubhn2_64(__n64, __n128, __n128); __n128 neon_ssubw_8(__n128, __n64); __n128 neon_ssubw2_8(__n128, __n128); __n128 neon_ssubw_16(__n128, __n64); __n128 neon_ssubw2_16(__n128, __n128); __n128 neon_ssubw_32(__n128, __n64); __n128 neon_ssubw2_32(__n128, __n128); __n128 neon_usubw_8(__n128, __n64); __n128 neon_usubw2_8(__n128, __n128); __n128 neon_usubw_16(__n128, __n64); __n128 neon_usubw2_16(__n128, __n128); __n128 neon_usubw_32(__n128, __n64); __n128 neon_usubw2_32(__n128, __n128); __n128 neon_ssubl_8(__n64, __n64); __n128 neon_ssubl2_8(__n128, __n128); __n128 neon_ssubl_16(__n64, __n64); __n128 neon_ssubl2_16(__n128, __n128); __n128 neon_ssubl_32(__n64, __n64); __n128 neon_ssubl2_32(__n128, __n128); __n128 neon_usubl_8(__n64, __n64); __n128 neon_usubl2_8(__n128, __n128); __n128 neon_usubl_16(__n64, __n64); __n128 neon_usubl2_16(__n128, __n128); __n128 neon_usubl_32(__n64, __n64); __n128 neon_usubl2_32(__n128, __n128); #define vsubhn_s16(src1, src2) neon_subhn_16(src1, src2) #define vsubhn_s32(src1, src2) neon_subhn_32(src1, src2) #define vsubhn_s64(src1, src2) neon_subhn_64(src1, src2) #define vsubhn_u16(src1, src2) neon_subhn_16(src1, src2) #define vsubhn_u32(src1, src2) neon_subhn_32(src1, src2) #define vsubhn_u64(src1, src2) neon_subhn_64(src1, src2) #define vsubhn_high_s16(src1, src2, src3) neon_subhn2_16(src1, src2, src3) #define 
vsubhn_high_s32(src1, src2, src3) neon_subhn2_32(src1, src2, src3)
#define vsubhn_high_s64(src1, src2, src3) neon_subhn2_64(src1, src2, src3)
#define vsubhn_high_u16(src1, src2, src3) neon_subhn2_16(src1, src2, src3)
#define vsubhn_high_u32(src1, src2, src3) neon_subhn2_32(src1, src2, src3)
#define vsubhn_high_u64(src1, src2, src3) neon_subhn2_64(src1, src2, src3)

// vrsubhn_*: every macro forwards its arguments unchanged to the intrinsic
// named in its expansion. Signed and unsigned variants of the same element
// width share one intrinsic (neon_rsubhn_16/32/64); the _high forms take a
// third argument and map to the "2"-suffixed intrinsics.
// NOTE(review): the neon_subhn2_*/neon_rsubhn* prototypes are not visible in
// this span - presumably declared earlier in the file.
#define vrsubhn_s16(src1, src2) neon_rsubhn_16(src1, src2)
#define vrsubhn_s32(src1, src2) neon_rsubhn_32(src1, src2)
#define vrsubhn_s64(src1, src2) neon_rsubhn_64(src1, src2)
#define vrsubhn_u16(src1, src2) neon_rsubhn_16(src1, src2)
#define vrsubhn_u32(src1, src2) neon_rsubhn_32(src1, src2)
#define vrsubhn_u64(src1, src2) neon_rsubhn_64(src1, src2)
#define vrsubhn_high_s16(src1, src2, src3) neon_rsubhn2_16(src1, src2, src3)
#define vrsubhn_high_s32(src1, src2, src3) neon_rsubhn2_32(src1, src2, src3)
#define vrsubhn_high_s64(src1, src2, src3) neon_rsubhn2_64(src1, src2, src3)
#define vrsubhn_high_u16(src1, src2, src3) neon_rsubhn2_16(src1, src2, src3)
#define vrsubhn_high_u32(src1, src2, src3) neon_rsubhn2_32(src1, src2, src3)
#define vrsubhn_high_u64(src1, src2, src3) neon_rsubhn2_64(src1, src2, src3)

// vsubw_* / vsubl_*: unlike the narrowing forms above, signedness selects the
// intrinsic here - _s* names map to neon_ssub*, _u* names map to neon_usub*.
#define vsubw_s8(src1, src2) neon_ssubw_8(src1, src2)
#define vsubw_s16(src1, src2) neon_ssubw_16(src1, src2)
#define vsubw_s32(src1, src2) neon_ssubw_32(src1, src2)
#define vsubw_u8(src1, src2) neon_usubw_8(src1, src2)
#define vsubw_u16(src1, src2) neon_usubw_16(src1, src2)
#define vsubw_u32(src1, src2) neon_usubw_32(src1, src2)
#define vsubl_s8(src1, src2) neon_ssubl_8(src1, src2)
#define vsubl_s16(src1, src2) neon_ssubl_16(src1, src2)
#define vsubl_s32(src1, src2) neon_ssubl_32(src1, src2)
#define vsubl_u8(src1, src2) neon_usubl_8(src1, src2)
#define vsubl_u16(src1, src2) neon_usubl_16(src1, src2)
#define vsubl_u32(src1, src2) neon_usubl_32(src1, src2)

// _high forms of the above: same argument lists, "2"-suffixed intrinsics.
#define vsubw_high_s8(src1, src2) neon_ssubw2_8(src1, src2)
#define vsubw_high_s16(src1, src2) neon_ssubw2_16(src1, src2)
#define vsubw_high_s32(src1, src2) neon_ssubw2_32(src1, src2)
#define vsubw_high_u8(src1, src2) neon_usubw2_8(src1, src2)
#define vsubw_high_u16(src1, src2) neon_usubw2_16(src1, src2)
#define vsubw_high_u32(src1, src2) neon_usubw2_32(src1, src2)
#define vsubl_high_s8(src1, src2) neon_ssubl2_8(src1, src2)
#define vsubl_high_s16(src1, src2) neon_ssubl2_16(src1, src2)
#define vsubl_high_s32(src1, src2) neon_ssubl2_32(src1, src2)
#define vsubl_high_u8(src1, src2) neon_usubl2_8(src1, src2)
#define vsubl_high_u16(src1, src2) neon_usubl2_16(src1, src2)
#define vsubl_high_u32(src1, src2) neon_usubl2_32(src1, src2)

// SABAL/UABAL/SABDL/UABDL
//
// Intrinsic prototypes. All of them return __n128. The *abal* forms take an
// extra leading __n128 argument; the *abdl* forms take only the two sources.
// The plain forms take __n64 sources, the "2" forms take __n128 sources.
__n128 neon_sabal_8 (__n128, __n64, __n64);
__n128 neon_sabal2_8 (__n128, __n128, __n128);
__n128 neon_sabal_16 (__n128, __n64, __n64);
__n128 neon_sabal2_16(__n128, __n128, __n128);
__n128 neon_sabal_32 (__n128, __n64, __n64);
__n128 neon_sabal2_32(__n128, __n128, __n128);
__n128 neon_uabal_8 (__n128, __n64, __n64);
__n128 neon_uabal2_8 (__n128, __n128, __n128);
__n128 neon_uabal_16 (__n128, __n64, __n64);
__n128 neon_uabal2_16(__n128, __n128, __n128);
__n128 neon_uabal_32 (__n128, __n64, __n64);
__n128 neon_uabal2_32(__n128, __n128, __n128);
__n128 neon_sabdl_8 (__n64, __n64);
__n128 neon_sabdl2_8 (__n128, __n128);
__n128 neon_sabdl_16 (__n64, __n64);
__n128 neon_sabdl2_16(__n128, __n128);
__n128 neon_sabdl_32 (__n64, __n64);
__n128 neon_sabdl2_32(__n128, __n128);
__n128 neon_uabdl_8 (__n64, __n64);
__n128 neon_uabdl2_8 (__n128, __n128);
__n128 neon_uabdl_16 (__n64, __n64);
__n128 neon_uabdl2_16(__n128, __n128);
__n128 neon_uabdl_32 (__n64, __n64);
__n128 neon_uabdl2_32(__n128, __n128);

// Public names for the prototypes above: _s* -> neon_s*, _u* -> neon_u*,
// and each _high name -> the matching "2"-suffixed intrinsic.
#define vabal_s8(src1, src2, src3) neon_sabal_8(src1, src2, src3)
#define vabal_s16(src1, src2, src3) neon_sabal_16(src1, src2, src3)
#define vabal_s32(src1, src2, src3) neon_sabal_32(src1, src2, src3)
#define vabal_u8(src1, src2, src3) neon_uabal_8(src1, src2, src3)
#define vabal_u16(src1, src2, src3) neon_uabal_16(src1, src2, src3)
#define vabal_u32(src1, src2, src3) neon_uabal_32(src1, src2, src3)
#define vabal_high_s8(src1, src2, src3) neon_sabal2_8(src1, src2, src3)
#define vabal_high_s16(src1, src2, src3) neon_sabal2_16(src1, src2, src3)
#define vabal_high_s32(src1, src2, src3) neon_sabal2_32(src1, src2, src3)
#define vabal_high_u8(src1, src2, src3) neon_uabal2_8(src1, src2, src3)
#define vabal_high_u16(src1, src2, src3) neon_uabal2_16(src1, src2, src3)
#define vabal_high_u32(src1, src2, src3) neon_uabal2_32(src1, src2, src3)
#define vabdl_s8(src1, src2) neon_sabdl_8(src1, src2)
#define vabdl_s16(src1, src2) neon_sabdl_16(src1, src2)
#define vabdl_s32(src1, src2) neon_sabdl_32(src1, src2)
#define vabdl_u8(src1, src2) neon_uabdl_8(src1, src2)
#define vabdl_u16(src1, src2) neon_uabdl_16(src1, src2)
#define vabdl_u32(src1, src2) neon_uabdl_32(src1, src2)
#define vabdl_high_s8(src1, src2) neon_sabdl2_8(src1, src2)
#define vabdl_high_s16(src1, src2) neon_sabdl2_16(src1, src2)
#define vabdl_high_s32(src1, src2) neon_sabdl2_32(src1, src2)
#define vabdl_high_u8(src1, src2) neon_uabdl2_8(src1, src2)
#define vabdl_high_u16(src1, src2) neon_uabdl2_16(src1, src2)
#define vabdl_high_u32(src1, src2) neon_uabdl2_32(src1, src2)

// vget_low/vget_high/vcombine
//
// The element type in the macro name does not affect the expansion: every
// vget_high_* is neon_dups64q(src, 1) and every vget_low_* is
// neon_dups64q(src, 0).
// NOTE(review): neon_dups64q is declared outside this span; the 0/1 argument
// is presumably the 64-bit lane index - confirm against its declaration.
#define vget_high_u8(src) neon_dups64q(src, 1)
#define vget_high_s8(src) neon_dups64q(src, 1)
#define vget_low_u8(src) neon_dups64q(src, 0)
#define vget_low_s8(src) neon_dups64q(src, 0)
#define vget_high_u16(src) neon_dups64q(src, 1)
#define vget_high_s16(src) neon_dups64q(src, 1)
#define vget_low_u16(src) neon_dups64q(src, 0)
#define vget_low_s16(src) neon_dups64q(src, 0)
#define vget_high_u32(src) neon_dups64q(src, 1)
#define vget_high_s32(src) neon_dups64q(src, 1)
#define vget_low_u32(src) neon_dups64q(src, 0)
#define vget_low_s32(src) neon_dups64q(src, 0)
#define vget_high_u64(src) neon_dups64q(src, 1)
#define vget_high_s64(src) neon_dups64q(src, 1)
#define vget_low_u64(src) neon_dups64q(src, 0)
#define vget_low_s64(src) neon_dups64q(src, 0)
#define vget_high_p8(src) neon_dups64q(src, 1)
#define vget_high_p16(src) neon_dups64q(src, 1)
#define vget_high_p64(src) neon_dups64q(src, 1)
#define vget_low_p8(src) neon_dups64q(src, 0)
#define vget_low_p16(src) neon_dups64q(src, 0)
#define vget_low_p64(src) neon_dups64q(src, 0)
#define vget_high_f32(src) neon_dups64q(src, 1)
#define vget_high_f16(src) neon_dups64q(src, 1)
#define vget_high_f64(src) neon_dups64q(src, 1)
#define vget_low_f32(src) neon_dups64q(src, 0)
#define vget_low_f16(src) neon_dups64q(src, 0)
#define vget_low_f64(src) neon_dups64q(src, 0)

// vcombine_*: one shared expansion for every element type. Lane 0 of `low`
// is read with vget_lane_u64 and passed to neon_dupqr64; the result is then
// passed to neon_insqr64 with index 1 and lane 0 of `high`. Each macro
// argument appears exactly once in the expansion.
// NOTE(review): vget_lane_u64, neon_dupqr64 and neon_insqr64 are defined
// outside this span.
#define vcombine_u8(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_s8(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_p8(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_u16(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_s16(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_p16(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_f16(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_u32(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_s32(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_f32(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_u64(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_s64(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_p64(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))
#define vcombine_f64(low, high) neon_insqr64(neon_dupqr64(vget_lane_u64(low, 0)), 1, vget_lane_u64(high, 0))

// VCREATE
//
// A single prototype (__int64 in, __n64 out) backs every vcreate_<type>
// macro; the type suffix only changes the public name.
__n64 vcreate(__int64 src);
#define vcreate_s8(src) vcreate(src)
#define vcreate_s16(src) vcreate(src)
#define vcreate_s32(src) vcreate(src)
#define vcreate_s64(src) vcreate(src)
#define vcreate_u8(src) vcreate(src)
#define vcreate_u16(src) vcreate(src)
#define vcreate_u32(src) vcreate(src)
#define vcreate_u64(src) vcreate(src)
#define vcreate_p64(src) vcreate(src)
#define vcreate_p16(src) vcreate(src)
#define vcreate_p8(src) vcreate(src)
#define vcreate_f16(src) vcreate(src)
#define vcreate_f32(src) vcreate(src)
#define vcreate_f64(src) vcreate(src)

// vreinterpret_<to>_<from>: identity macros - each expands to its
// parenthesized argument and nothing else. Listed per destination type for
// f32, s8, s16, s32, s64, p8, p16, u8, u16, u32 and u64; a type paired with
// itself is not listed.
// NOTE(review): presumably an identity suffices because all 64-bit vector
// types share the single __n64 union - the per-type vector typedefs are not
// visible in this span, confirm there.
#define vreinterpret_f32_s8(a) (a)
#define vreinterpret_f32_s16(a) (a)
#define vreinterpret_f32_s32(a) (a)
#define vreinterpret_f32_s64(a) (a)
#define vreinterpret_f32_p8(a) (a)
#define vreinterpret_f32_p16(a) (a)
#define vreinterpret_f32_u8(a) (a)
#define vreinterpret_f32_u16(a) (a)
#define vreinterpret_f32_u32(a) (a)
#define vreinterpret_f32_u64(a) (a)

#define vreinterpret_s8_f32(a) (a)
#define vreinterpret_s8_s16(a) (a)
#define vreinterpret_s8_s32(a) (a)
#define vreinterpret_s8_s64(a) (a)
#define vreinterpret_s8_p8(a) (a)
#define vreinterpret_s8_p16(a) (a)
#define vreinterpret_s8_u8(a) (a)
#define vreinterpret_s8_u16(a) (a)
#define vreinterpret_s8_u32(a) (a)
#define vreinterpret_s8_u64(a) (a)

#define vreinterpret_s16_f32(a) (a)
#define vreinterpret_s16_s8(a) (a)
#define vreinterpret_s16_s32(a) (a)
#define vreinterpret_s16_s64(a) (a)
#define vreinterpret_s16_p8(a) (a)
#define vreinterpret_s16_p16(a) (a)
#define vreinterpret_s16_u8(a) (a)
#define vreinterpret_s16_u16(a) (a)
#define vreinterpret_s16_u32(a) (a)
#define vreinterpret_s16_u64(a) (a)

#define vreinterpret_s32_f32(a) (a)
#define vreinterpret_s32_s8(a) (a)
#define vreinterpret_s32_s16(a) (a)
#define vreinterpret_s32_s64(a) (a)
#define vreinterpret_s32_p8(a) (a)
#define vreinterpret_s32_p16(a) (a)
#define vreinterpret_s32_u8(a) (a)
#define vreinterpret_s32_u16(a) (a)
#define vreinterpret_s32_u32(a) (a)
#define vreinterpret_s32_u64(a) (a)

#define vreinterpret_s64_f32(a) (a)
#define vreinterpret_s64_s8(a) (a)
#define vreinterpret_s64_s16(a) (a)
#define vreinterpret_s64_s32(a) (a)
#define vreinterpret_s64_p8(a) (a)
#define vreinterpret_s64_p16(a) (a)
#define vreinterpret_s64_u8(a) (a)
#define vreinterpret_s64_u16(a) (a)
#define vreinterpret_s64_u32(a) (a)
#define vreinterpret_s64_u64(a) (a)

#define vreinterpret_p8_f32(a) (a)
#define vreinterpret_p8_s8(a) (a)
#define vreinterpret_p8_s16(a) (a)
#define vreinterpret_p8_s32(a) (a)
#define vreinterpret_p8_s64(a) (a)
#define vreinterpret_p8_p16(a) (a)
#define vreinterpret_p8_u8(a) (a)
#define vreinterpret_p8_u16(a) (a)
#define vreinterpret_p8_u32(a) (a)
#define vreinterpret_p8_u64(a) (a)

#define vreinterpret_p16_f32(a) (a)
#define vreinterpret_p16_s8(a) (a)
#define vreinterpret_p16_s16(a) (a)
#define vreinterpret_p16_s32(a) (a)
#define vreinterpret_p16_s64(a) (a)
#define vreinterpret_p16_p8(a) (a)
#define vreinterpret_p16_u8(a) (a)
#define vreinterpret_p16_u16(a) (a)
#define vreinterpret_p16_u32(a) (a)
#define vreinterpret_p16_u64(a) (a)

#define vreinterpret_u8_f32(a) (a)
#define vreinterpret_u8_s8(a) (a)
#define vreinterpret_u8_s16(a) (a)
#define vreinterpret_u8_s32(a) (a)
#define vreinterpret_u8_s64(a) (a)
#define vreinterpret_u8_p8(a) (a)
#define vreinterpret_u8_p16(a) (a)
#define vreinterpret_u8_u16(a) (a)
#define vreinterpret_u8_u32(a) (a)
#define vreinterpret_u8_u64(a) (a)

#define vreinterpret_u16_f32(a) (a)
#define vreinterpret_u16_s8(a) (a)
#define vreinterpret_u16_s16(a) (a)
#define vreinterpret_u16_s32(a) (a)
#define vreinterpret_u16_s64(a) (a)
#define vreinterpret_u16_p8(a) (a)
#define vreinterpret_u16_p16(a) (a)
#define vreinterpret_u16_u8(a) (a)
#define vreinterpret_u16_u32(a) (a)
#define vreinterpret_u16_u64(a) (a)

#define vreinterpret_u32_f32(a) (a)
#define vreinterpret_u32_s8(a) (a)
#define vreinterpret_u32_s16(a) (a)
#define vreinterpret_u32_s32(a) (a)
#define vreinterpret_u32_s64(a) (a)
#define vreinterpret_u32_p8(a) (a)
#define vreinterpret_u32_p16(a) (a)
#define vreinterpret_u32_u8(a) (a)
#define vreinterpret_u32_u16(a) (a)
#define vreinterpret_u32_u64(a) (a)

#define vreinterpret_u64_f32(a) (a)
#define vreinterpret_u64_s8(a) (a)
#define vreinterpret_u64_s16(a) (a)
#define vreinterpret_u64_s32(a) (a)
#define vreinterpret_u64_s64(a) (a)
#define vreinterpret_u64_p8(a) (a)
#define vreinterpret_u64_p16(a) (a)
#define vreinterpret_u64_u8(a) (a)
#define vreinterpret_u64_u16(a) (a)
#define vreinterpret_u64_u32(a) (a)

// vreinterpretq_<to>_<from>: the same identity table under the "q" names.
#define vreinterpretq_f32_s8(a) (a)
#define vreinterpretq_f32_s16(a) (a)
#define vreinterpretq_f32_s32(a) (a)
#define vreinterpretq_f32_s64(a) (a)
#define vreinterpretq_f32_p8(a) (a)
#define vreinterpretq_f32_p16(a) (a)
#define vreinterpretq_f32_u8(a) (a)
#define vreinterpretq_f32_u16(a) (a)
#define vreinterpretq_f32_u32(a) (a)
#define vreinterpretq_f32_u64(a) (a)

#define vreinterpretq_s8_f32(a) (a)
#define vreinterpretq_s8_s16(a) (a)
#define vreinterpretq_s8_s32(a) (a)
#define vreinterpretq_s8_s64(a) (a)
#define vreinterpretq_s8_p8(a) (a)
#define vreinterpretq_s8_p16(a) (a)
#define vreinterpretq_s8_u8(a) (a)
#define vreinterpretq_s8_u16(a) (a)
#define vreinterpretq_s8_u32(a) (a)
#define vreinterpretq_s8_u64(a) (a)

#define vreinterpretq_s16_f32(a) (a)
#define vreinterpretq_s16_s8(a) (a)
#define vreinterpretq_s16_s32(a) (a)
#define vreinterpretq_s16_s64(a) (a)
#define vreinterpretq_s16_p8(a) (a)
#define vreinterpretq_s16_p16(a) (a)
#define vreinterpretq_s16_u8(a) (a)
#define vreinterpretq_s16_u16(a) (a)
#define vreinterpretq_s16_u32(a) (a)
#define vreinterpretq_s16_u64(a) (a)

#define vreinterpretq_s32_f32(a) (a)
#define vreinterpretq_s32_s8(a) (a)
#define vreinterpretq_s32_s16(a) (a)
#define vreinterpretq_s32_s64(a) (a)
#define vreinterpretq_s32_p8(a) (a)
#define vreinterpretq_s32_p16(a) (a)
#define vreinterpretq_s32_u8(a) (a)
#define vreinterpretq_s32_u16(a) (a)
#define vreinterpretq_s32_u32(a) (a)
#define vreinterpretq_s32_u64(a) (a)

#define vreinterpretq_s64_f32(a) (a)
#define vreinterpretq_s64_s8(a) (a)
#define vreinterpretq_s64_s16(a) (a)
#define vreinterpretq_s64_s32(a) (a)
#define vreinterpretq_s64_p8(a) (a)
#define vreinterpretq_s64_p16(a) (a)
#define vreinterpretq_s64_u8(a) (a)
#define vreinterpretq_s64_u16(a) (a)
#define vreinterpretq_s64_u32(a) (a)
#define vreinterpretq_s64_u64(a) (a)

#define vreinterpretq_p8_f32(a) (a)
#define vreinterpretq_p8_s8(a) (a)
#define vreinterpretq_p8_s16(a) (a)
#define vreinterpretq_p8_s32(a) (a)
#define vreinterpretq_p8_s64(a) (a)
#define vreinterpretq_p8_p16(a) (a)
#define vreinterpretq_p8_u8(a) (a)
#define vreinterpretq_p8_u16(a) (a)
#define vreinterpretq_p8_u32(a) (a)
#define vreinterpretq_p8_u64(a) (a)

#define vreinterpretq_p16_f32(a) (a)
#define vreinterpretq_p16_s8(a) (a)
#define vreinterpretq_p16_s16(a) (a)
#define vreinterpretq_p16_s32(a) (a)
#define vreinterpretq_p16_s64(a) (a)
#define vreinterpretq_p16_p8(a) (a)
#define vreinterpretq_p16_u8(a) (a)
#define vreinterpretq_p16_u16(a) (a)
#define vreinterpretq_p16_u32(a) (a)
#define vreinterpretq_p16_u64(a) (a)

#define vreinterpretq_u8_f32(a) (a)
#define vreinterpretq_u8_s8(a) (a)
#define vreinterpretq_u8_s16(a) (a)
#define vreinterpretq_u8_s32(a) (a)
#define vreinterpretq_u8_s64(a) (a)
#define vreinterpretq_u8_p8(a) (a)
#define vreinterpretq_u8_p16(a) (a)
#define vreinterpretq_u8_u16(a) (a)
#define vreinterpretq_u8_u32(a) (a)
#define vreinterpretq_u8_u64(a) (a)

#define vreinterpretq_u16_f32(a) (a)
#define vreinterpretq_u16_s8(a) (a)
#define vreinterpretq_u16_s16(a) (a)
#define vreinterpretq_u16_s32(a) (a)
#define vreinterpretq_u16_s64(a) (a)
#define vreinterpretq_u16_p8(a) (a)
#define vreinterpretq_u16_p16(a) (a)
#define vreinterpretq_u16_u8(a) (a)
#define vreinterpretq_u16_u32(a) (a)
#define vreinterpretq_u16_u64(a) (a)

#define vreinterpretq_u32_f32(a) (a)
#define vreinterpretq_u32_s8(a) (a)
#define vreinterpretq_u32_s16(a) (a)
#define vreinterpretq_u32_s32(a) (a)
// Identity reinterpret macros (continued): each one expands to its
// parenthesized argument and nothing else.
#define vreinterpretq_u32_s64(a) (a)
#define vreinterpretq_u32_p8(a) (a)
#define vreinterpretq_u32_p16(a) (a)
#define vreinterpretq_u32_u8(a) (a)
#define vreinterpretq_u32_u16(a) (a)
#define vreinterpretq_u32_u64(a) (a)

#define vreinterpretq_u64_f32(a) (a)
#define vreinterpretq_u64_s8(a) (a)
#define vreinterpretq_u64_s16(a) (a)
#define vreinterpretq_u64_s32(a) (a)
#define vreinterpretq_u64_s64(a) (a)
#define vreinterpretq_u64_p8(a) (a)
#define vreinterpretq_u64_p16(a) (a)
#define vreinterpretq_u64_u8(a) (a)
#define vreinterpretq_u64_u16(a) (a)
#define vreinterpretq_u64_u32(a) (a)

#define vreinterpret_f16_u16(a) (a)
#define vreinterpret_u16_f16(a) (a)

#if defined (__cplusplus)
}
#endif /* defined (__cplusplus) */

///////////////////////////////////////////////////////////////////////////////
//
// VLDx/VSTx alignment specifications
//
// Each selector maps an alignment value to a small index: 8 yields 0, the
// values named in the macro yield 1, 2, 3 in ascending order, and any other
// value yields -1. The argument can be evaluated more than once, so pass an
// expression without side effects.
// NOTE(review): the consumers of these indices are not visible here -
// presumably the VLDx/VSTx intrinsic macros; confirm at the use sites.
//

#define _NEON_ALIGN16(a) \
    (((a) == 8)  ? 0 : \
    (((a) == 16) ? 1 : -1))

#define _NEON_ALIGN32(a) \
    (((a) == 8)  ? 0 : \
    (((a) == 32) ? 1 : -1))

#define _NEON_ALIGN64(a) \
    (((a) == 8)  ? 0 : \
    (((a) == 64) ? 1 : -1))

#define _NEON_ALIGN64_128(a) \
    (((a) == 8)   ? 0 : \
    (((a) == 64)  ? 1 : \
    (((a) == 128) ? 2 : -1)))

#define _NEON_ALIGN64_128_256(a) \
    (((a) == 8)   ? 0 : \
    (((a) == 64)  ? 1 : \
    (((a) == 128) ? 2 : \
    (((a) == 256) ? 3 : -1))))

/* 88bf0570-3001-4e78-a5f2-be5765546192 */