您当前的位置:首页 > IT编程 > C++
| C语言 | Java | VB | VC | python | Android | TensorFlow | C++ | oracle | 学术与代码 | cnn卷积神经网络 | gnn | 图像修复 | Keras | 数据集 | Neo4j | 自然语言处理 | 深度学习 | 医学CAD | 医学影像 | 超参数 | pointnet | pytorch | 异常检测 | Transformers | 情感分类 | 知识图谱 |

自学教程:C++ vmulq_f32函数代码示例

51自学网 2021-06-03 09:46:19
  C++
这篇教程C++ vmulq_f32函数代码示例写得很实用,希望能帮到您。

本文整理汇总了C++中vmulq_f32函数的典型用法代码示例。如果您正苦于以下问题:C++ vmulq_f32函数的具体用法?C++ vmulq_f32怎么用?C++ vmulq_f32使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。

在下文中一共展示了vmulq_f32函数的29个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。

示例1: AudioBlockPanStereoToStereo_NEON

// Pans one stereo block into a stereo output block.
//
//   aIsOnTheLeft:  outL = inL + inR * aGainL     outR = inR * aGainR
//   otherwise:     outL = inL * aGainL           outR = inR + inL * aGainR
//
// Eight samples are handled per iteration and there is no scalar tail, so
// WEBAUDIO_BLOCK_SIZE is expected to be a multiple of 8. All four inputs of an
// iteration are loaded before anything is stored.
void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                      float aGainL, float aGainR,
                                      bool aIsOnTheLeft,
                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  const float32x4_t gainL = vmovq_n_f32(aGainL);
  const float32x4_t gainR = vmovq_n_f32(aGainR);

  if (aIsOnTheLeft) {
    uint32_t pos = 0;
    while (pos < WEBAUDIO_BLOCK_SIZE) {
      const float32x4_t leftLo = vld1q_f32(ADDRESS_OF(aInputL, pos));
      const float32x4_t leftHi = vld1q_f32(ADDRESS_OF(aInputL, pos + 4));
      const float32x4_t rightLo = vld1q_f32(ADDRESS_OF(aInputR, pos));
      const float32x4_t rightHi = vld1q_f32(ADDRESS_OF(aInputR, pos + 4));

      // Left output keeps the left input and adds the scaled right input.
      vst1q_f32(ADDRESS_OF(aOutputL, pos), vmlaq_f32(leftLo, rightLo, gainL));
      vst1q_f32(ADDRESS_OF(aOutputL, pos + 4),
                vmlaq_f32(leftHi, rightHi, gainL));

      // Right output is the right input scaled by its own gain.
      vst1q_f32(ADDRESS_OF(aOutputR, pos), vmulq_f32(rightLo, gainR));
      vst1q_f32(ADDRESS_OF(aOutputR, pos + 4), vmulq_f32(rightHi, gainR));

      pos += 8;
    }
    return;
  }

  uint32_t pos = 0;
  while (pos < WEBAUDIO_BLOCK_SIZE) {
    const float32x4_t leftLo = vld1q_f32(ADDRESS_OF(aInputL, pos));
    const float32x4_t leftHi = vld1q_f32(ADDRESS_OF(aInputL, pos + 4));
    const float32x4_t rightLo = vld1q_f32(ADDRESS_OF(aInputR, pos));
    const float32x4_t rightHi = vld1q_f32(ADDRESS_OF(aInputR, pos + 4));

    // Left output is the left input scaled by its own gain.
    vst1q_f32(ADDRESS_OF(aOutputL, pos), vmulq_f32(leftLo, gainL));
    vst1q_f32(ADDRESS_OF(aOutputL, pos + 4), vmulq_f32(leftHi, gainL));

    // Right output keeps the right input and adds the scaled left input.
    vst1q_f32(ADDRESS_OF(aOutputR, pos), vmlaq_f32(rightLo, leftLo, gainR));
    vst1q_f32(ADDRESS_OF(aOutputR, pos + 4),
              vmlaq_f32(rightHi, leftHi, gainR));

    pos += 8;
  }
}
开发者ID:jasonLaster,项目名称:gecko-dev,代码行数:60,


示例2: AudioBlockCopyChannelWithScale_NEON

// Writes aOutput[i] = aInput[i] * aScale[i] for one audio block.
//
// Sixteen samples are handled per iteration and there is no scalar tail, so
// WEBAUDIO_BLOCK_SIZE is expected to be a multiple of 16.
void AudioBlockCopyChannelWithScale_NEON(
    const float aInput[WEBAUDIO_BLOCK_SIZE],
    const float aScale[WEBAUDIO_BLOCK_SIZE],
    float aOutput[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aScale);
  ASSERT_ALIGNED(aOutput);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale0, vscale1, vscale2, vscale3;

  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));

    vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
    vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
    vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
    vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));

    vout0 = vmulq_f32(vin0, vscale0);
    vout1 = vmulq_f32(vin1, vscale1);
    vout2 = vmulq_f32(vin2, vscale2);
    vout3 = vmulq_f32(vin3, vscale3);

    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
  }
}
开发者ID:Incognito,项目名称:mozilla-central,代码行数:35,


示例3: vsqrtq_f32

// Lane-wise square root of |s|, computed as s * (1 / sqrt(s)) with the
// reciprocal square root refined from the VRSQRTE estimate.
static float32x4_t vsqrtq_f32(float32x4_t s) {
  int i;
  float32x4_t x = vrsqrteq_f32(s);

  // Code to handle sqrt(0).
  // If the input to sqrtf() is zero, a zero will be returned.
  // If the input to vrsqrteq_f32() is zero, positive infinity is returned.
  const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
  // check for divide by zero
  const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x));
  // zero out the positive infinity results
  x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero),
                                      vreinterpretq_u32_f32(x)));
  // from arm documentation
  // The Newton-Raphson iteration:
  //     x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2
  // converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (i = 0; i < 2; i++) {
    x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x);
  }
  // sqrt(s) = s * 1/sqrt(s)
  // Lanes that were zeroed above stay zero through the iterations, so
  // sqrt(0) comes out as 0 rather than 0 * inf.
  return vmulq_f32(s, x);
}
开发者ID:Crawping,项目名称:chromium_extract,代码行数:25,


示例4: cv_vrecpq_f32

// Lane-wise reciprocal 1/val: the VRECPE estimate followed by two
// Newton-Raphson refinement steps (VRECPS yields the 2 - val * x factor).
inline float32x4_t cv_vrecpq_f32(float32x4_t val)
{
    float32x4_t inv = vrecpeq_f32(val);
    for (int step = 0; step < 2; ++step)
    {
        const float32x4_t correction = vrecpsq_f32(val, inv);
        inv = vmulq_f32(correction, inv);
    }
    return inv;
}
开发者ID:EmergentVR,项目名称:opencv_win312,代码行数:7,


示例5: AudioBufferInPlaceScale_NEON

void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize) {  ASSERT_ALIGNED(aBlock);  float32x4_t vin0, vin1, vin2, vin3;  float32x4_t vout0, vout1, vout2, vout3;  float32x4_t vscale = vmovq_n_f32(aScale);  uint32_t dif = aSize % 16;  uint32_t vectorSize = aSize - dif;  uint32_t i = 0;  for (; i < vectorSize; i += 16) {    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));    vout0 = vmulq_f32(vin0, vscale);    vout1 = vmulq_f32(vin1, vscale);    vout2 = vmulq_f32(vin2, vscale);    vout3 = vmulq_f32(vin3, vscale);    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);  }  for (unsigned j = 0; j < dif; ++i, ++j) {    aBlock[i] *= aScale;  }}
开发者ID:jasonLaster,项目名称:gecko-dev,代码行数:31,


示例6: vaddq_f32

    //-----------------------------------------------------------------------------------    void MathlibNEON::SinCos4( ArrayReal x, ArrayReal &outSin, ArrayReal &outCos )    {        // TODO: Improve accuracy by mapping to the range [-pi/4, pi/4] and swap        // between cos & sin depending on which quadrant it fell:        // Quadrant | sin     |  cos        // n = 0 ->  sin( x ),  cos( x )        // n = 1 ->  cos( x ), -sin( x )        // n = 2 -> -sin( x ), -cos( x )        // n = 3 -> -sin( x ),  sin( x )        // See ARGUMENT REDUCTION FOR HUGE ARGUMENTS:        // Good to the Last Bit        // K. C. Ng and themembers of the FP group of SunPro        // http://www.derekroconnor.net/Software/Ng--ArgReduction.pdf        // -- Perhaps we can leave this to GSoC students? --        // Map arbitrary angle x to the range [-pi; +pi] without using division.        // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON)        // can replace the add, the sub, & the two muls for two mad        ArrayReal integralPart;        x = vaddq_f32( vmulq_f32( x, ONE_DIV_2PI ), HALF );        x = Modf4( x, integralPart );        x = vsubq_f32( vmulq_f32( x, TWO_PI ), PI );        sincos_ps( x, &outSin, &outCos );    }
开发者ID:amerkoleci,项目名称:mogre,代码行数:27,


示例7: cv_vrsqrtq_f32

// Lane-wise reciprocal square root 1/sqrt(val): the VRSQRTE estimate followed
// by two Newton-Raphson refinement steps (VRSQRTS yields (3 - val * x^2) / 2).
inline float32x4_t cv_vrsqrtq_f32(float32x4_t val)
{
    float32x4_t rsqrt = vrsqrteq_f32(val);
    for (int step = 0; step < 2; ++step)
    {
        const float32x4_t squared = vmulq_f32(rsqrt, rsqrt);
        const float32x4_t correction = vrsqrtsq_f32(squared, val);
        rsqrt = vmulq_f32(correction, rsqrt);
    }
    return rsqrt;
}
开发者ID:EmergentVR,项目名称:opencv_win312,代码行数:7,


示例8: ScaleErrorSignalNEON

static void ScaleErrorSignalNEON(int extended_filter_enabled,                                 float normal_mu,                                 float normal_error_threshold,                                 float x_pow[PART_LEN1],                                 float ef[2][PART_LEN1]) {  const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;  const float error_threshold = extended_filter_enabled ?      kExtendedErrorThreshold : normal_error_threshold;  const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);  const float32x4_t kMu = vmovq_n_f32(mu);  const float32x4_t kThresh = vmovq_n_f32(error_threshold);  int i;  // vectorized code (four at once)  for (i = 0; i + 3 < PART_LEN1; i += 4) {    const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]);    const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);    const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);    const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f);    float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);    float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);    const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);    const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);    const float32x4_t absEf = vsqrtq_f32(ef_sum2);    const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);    const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);    const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);    uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));    uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));    uint32x4_t ef_re_u32 = vandq_u32(vmvnq_u32(bigger),                                     vreinterpretq_u32_f32(ef_re));    uint32x4_t ef_im_u32 = vandq_u32(vmvnq_u32(bigger),                                     vreinterpretq_u32_f32(ef_im));    ef_re_if = vandq_u32(bigger, ef_re_if);    ef_im_if = vandq_u32(bigger, ef_im_if);    ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);    ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);    ef_re = 
vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);    ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);    vst1q_f32(&ef[0][i], ef_re);    vst1q_f32(&ef[1][i], ef_im);  }  // scalar code for the remaining items.  for (; i < PART_LEN1; i++) {    float abs_ef;    ef[0][i] /= (x_pow[i] + 1e-10f);    ef[1][i] /= (x_pow[i] + 1e-10f);    abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);    if (abs_ef > error_threshold) {      abs_ef = error_threshold / (abs_ef + 1e-10f);      ef[0][i] *= abs_ef;      ef[1][i] *= abs_ef;    }    // Stepsize factor    ef[0][i] *= mu;    ef[1][i] *= mu;  }}
开发者ID:Crawping,项目名称:chromium_extract,代码行数:59,


示例9: qcms_transform_data_rgba_out_lut_neon

/*
 * Transforms `length` RGBA pixels from src to dest.
 *
 * For each pixel the R, G and B bytes index the input gamma tables, the three
 * looked-up values are multiplied by matrix rows 0..2 and summed, the sum is
 * clamped to [zero, clampMaxValue], scaled by floatScale and converted to
 * integer indices into the output tables. The alpha byte is copied through
 * unchanged.
 */
void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
  size_t i;
  float32_t (*mat)[4] = transform->matrix;

  const float32_t *igtbl_r = (float32_t*)transform->input_gamma_table_r;
  const float32_t *igtbl_g = (float32_t*)transform->input_gamma_table_g;
  const float32_t *igtbl_b = (float32_t*)transform->input_gamma_table_b;

  const uint8_t *otdata_r = &transform->output_table_r->data[0];
  const uint8_t *otdata_g = &transform->output_table_g->data[0];
  const uint8_t *otdata_b = &transform->output_table_b->data[0];

  const float32x4_t row0 = vld1q_f32(mat[0]);
  const float32x4_t row1 = vld1q_f32(mat[1]);
  const float32x4_t row2 = vld1q_f32(mat[2]);

  const float32x4_t upper = vld1q_dup_f32(&clampMaxValue);
  const float32x4_t lower = vld1q_dup_f32(&zero);
  const float32x4_t to_index = vld1q_dup_f32(&floatScale);

  /* CYA */
  if (length == 0)
    return;

  for (i = 0; i < length; i++, src += 4, dest += 4) {
    /* setup for transforming the pixel: broadcast each gamma-corrected
     * channel across a vector and weight it by its matrix row */
    const float32x4_t r = vmulq_f32(vld1q_dup_f32(&igtbl_r[src[0]]), row0);
    const float32x4_t g = vmulq_f32(vld1q_dup_f32(&igtbl_g[src[1]]), row1);
    const float32x4_t b = vmulq_f32(vld1q_dup_f32(&igtbl_b[src[2]]), row2);
    const unsigned char alpha = src[3];

    /* sum as r + (g + b), then clamp: max against the floor first, min
     * against the ceiling second */
    float32x4_t mixed = vaddq_f32(r, vaddq_f32(g, b));
    int32x4_t result;

    mixed = vmaxq_f32(lower, mixed);
    mixed = vminq_f32(upper, mixed);
    result = vcvtq_s32_f32(vmulq_f32(mixed, to_index));

    /* use calc'd indices to output RGB values */
    dest[0] = otdata_r[vgetq_lane_s32(result, 0)];
    dest[1] = otdata_g[vgetq_lane_s32(result, 1)];
    dest[2] = otdata_b[vgetq_lane_s32(result, 2)];
    dest[3] = alpha;
  }
}
开发者ID:makotokato,项目名称:qcms,代码行数:57,


示例10: neon_make_rgb

/*
 * Converts one macropixel [Y0, U, Y1, V] into two RGBA vectors.
 *
 * Each output is [Y + 1.4075*V, Y - 0.34455*U - 0.7169*V, Y + 1.7790*U, 255]
 * with Y = Y0 for *rgba0p and Y = Y1 for *rgba1p. The chroma coefficients have
 * 0 in lane 3, so the alpha lane stays at 255.
 */
static inline void neon_make_rgb(float32x4_t macropixel, float32x4_t *rgba0p,
				 float32x4_t  *rgba1p)
{
  const float32x4_t  u_coeff = {0.0, -0.34455,  1.7790, 0.0 };
  const float32x4_t  v_coeff = {1.4075, -0.7169, 0.0, 0.0 };
  const float32_t alpha = 255.0;

  /* vdupq_lane_f32 only accepts two-element vectors, so split the  */
  /* macropixel into its halves: [Y0, U] and [Y1, V].  */
  const float32x2_t luma0_u = vget_low_f32(macropixel);
  const float32x2_t luma1_v = vget_high_f32(macropixel);

  /* Chroma contribution shared by both pixels: U*u_coeff + V*v_coeff.  */
  const float32x4_t u_part = vmulq_f32(vdupq_lane_f32(luma0_u, 1), u_coeff);
  const float32x4_t v_part = vmulq_f32(vdupq_lane_f32(luma1_v, 1), v_coeff);
  const float32x4_t chroma = vaddq_f32(u_part, v_part);

  /* Broadcast each luma value and overwrite lane 3 with alpha.  */
  const float32x4_t base0 =
    vsetq_lane_f32(alpha, vdupq_lane_f32(luma0_u, 0), 3);
  const float32x4_t base1 =
    vsetq_lane_f32(alpha, vdupq_lane_f32(luma1_v, 0), 3);

  *rgba0p = vaddq_f32(base0, chroma);
  *rgba1p = vaddq_f32(base1, chroma);
}
开发者ID:yaojingguo,项目名称:gcc-intrinsics-samplecode,代码行数:54,


示例11: mw_neon_mm_mul_f32x4

/* f32x4 mm mul */
/*
 * C = A * B for single-precision matrices stored column by column:
 *   A is Row x T   (column j starts at A + j * Row)
 *   B is T   x Col (column k starts at B + k * T)
 *   C is Row x Col (column k starts at C + k * Row)
 *
 * Four rows of C are produced at a time and four terms of the dot product are
 * consumed per inner iteration; there is no tail handling, so Row and T are
 * expected to be multiples of 4.
 *
 * The column offsets of A and B were previously computed as j * T and
 * k * Row, which only matches the Row-sized column step used for A (and the
 * T-long reads of B) when Row == T. For Row == T the result is unchanged.
 * Each C vector is now stored once after the accumulation completes instead
 * of after every inner iteration; as a consequence C is written (as zeros)
 * even when T <= 0.
 */
void mw_neon_mm_mul_f32x4(float * A, int Row, int T, float * B, int Col, float * C)
{
	int i, k, j;
	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i += 4)
	{
		for (k = 0; k < Col; k += 1)
		{
			const float *b_col = B + k * T;

			neon_c = vmovq_n_f32(0);
			for (j = 0; j < T; j += 4)
			{
				/* Rows i..i+3 of columns j..j+3 of A. */
				const float *a_col = A + j * Row + i;

				neon_a0 = vld1q_f32(a_col);
				neon_a1 = vld1q_f32(a_col + Row);
				neon_a2 = vld1q_f32(a_col + 2 * Row);
				neon_a3 = vld1q_f32(a_col + 3 * Row);

				/* Rows j..j+3 of column k of B, each broadcast to a vector. */
				neon_b = vld1q_f32(b_col + j);
				neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
				neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
				neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
				neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

				neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);
			}
			vst1q_f32(C + k * Row + i, neon_c);
		}
	}
}
开发者ID:Daniel-Hwang,项目名称:eeg,代码行数:50,


示例12: vdivq_f32

// Lane-wise a / b, computed as a * (1 / b) with the reciprocal refined from
// the VRECPE estimate.
static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) {
  int i;
  float32x4_t x = vrecpeq_f32(b);
  // from arm documentation
  // The Newton-Raphson iteration:
  //     x[n+1] = x[n] * (2 - d * x[n])
  // converges to (1/d) if x0 is the result of VRECPE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (i = 0; i < 2; i++) {
    x = vmulq_f32(vrecpsq_f32(b, x), x);
  }
  // a/b = a*(1/b)
  return vmulq_f32(a, x);
}
开发者ID:Crawping,项目名称:chromium_extract,代码行数:15,


示例13: cft1st_128_neon

static void cft1st_128_neon(float* a) {  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);  int j, k2;  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {    float32x4_t a00v = vld1q_f32(&a[j + 0]);    float32x4_t a04v = vld1q_f32(&a[j + 4]);    float32x4_t a08v = vld1q_f32(&a[j + 8]);    float32x4_t a12v = vld1q_f32(&a[j + 12]);    float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));    float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));    float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));    float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));    const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);    float32x4_t x0v = vaddq_f32(a01v, a23v);    const float32x4_t x1v = vsubq_f32(a01v, a23v);    const float32x4_t x2v = vaddq_f32(a45v, a67v);    const float32x4_t x3v = vsubq_f32(a45v, a67v);    const float32x4_t x3w = vrev64q_f32(x3v);    float32x4_t x0w;    a01v = vaddq_f32(x0v, x2v);    x0v = vsubq_f32(x0v, x2v);    x0w = vrev64q_f32(x0v);    a45v = vmulq_f32(wk2rv, x0v);    a45v = vmlaq_f32(a45v, wk2iv, x0w);    x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);    x0w = vrev64q_f32(x0v);    a23v = vmulq_f32(wk1rv, x0v);    a23v = vmlaq_f32(a23v, wk1iv, x0w);    x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);    x0w = vrev64q_f32(x0v);    a67v = vmulq_f32(wk3rv, x0v);    a67v = vmlaq_f32(a67v, wk3iv, x0w);    a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));    a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));    a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));    a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));    vst1q_f32(&a[j + 0], a00v);    
vst1q_f32(&a[j + 4], a04v);    vst1q_f32(&a[j + 8], a08v);    vst1q_f32(&a[j + 12], a12v);  }}
开发者ID:dreaflove,项目名称:webrtc-lib,代码行数:48,


示例14: test_fma

static void test_fma() {    for(int i=0; i<1020 * 4; i++) {        data_f[i] = i;    }    float32x4_t c0_02 = vdupq_n_f32(0.02f);    float32x4_t c0_04 = vdupq_n_f32(0.04f);    float32x4_t c0_05 = vdupq_n_f32(0.05f);    float32x4_t c0_10 = vdupq_n_f32(0.1f);    float32x4_t c0_20 = vdupq_n_f32(0.2f);    float32x4_t c1_00 = vdupq_n_f32(1.0f);    startTime();    // Do ~1 billion ops    for (int ct=0; ct < (1000 * (1000 / 80)); ct++) {        for (int i=0; i < 1000; i++) {            float32x4_t t;            t = vmulq_f32(vld1q_f32((float32_t *)&data_f[i]), c0_02);            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+4]), c0_04);            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+8]), c0_05);            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+12]), c0_10);            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+16]), c0_20);            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+20]), c0_20);            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+24]), c0_10);            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+28]), c0_05);            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+32]), c0_04);            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+36]), c0_02);            t = vaddq_f32(t, c1_00);            vst1q_f32((float32_t *)&data_f[i], t);        }    }    endTime("neon fma", 1e9);}
开发者ID:Khaon,项目名称:android_system_extras,代码行数:34,


示例15: SquaredDifferenceSum16f

        // Sums (a[i] - b[i])^2 over `size` half-precision values (stored as
        // uint16_t) and writes the total to *sum. Requires size >= F.
        //
        // The bulk is accumulated DF elements at a time into two partial sums,
        // then F at a time; a final overlapping vector covers any remainder.
        // NOTE(review): F and DF are presumably the float vector width and
        // twice that - confirm against their definitions.
        template <bool align> SIMD_INLINE void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum)
        {
            assert(size >= F);
            if (align)
                assert(Aligned(a) && Aligned(b));

            size_t partialAlignedSize = AlignLo(size, F);
            size_t fullAlignedSize = AlignLo(size, DF);
            size_t i = 0;
            float32x4_t sums[2] = { vdupq_n_f32(0), vdupq_n_f32(0) };
            if (fullAlignedSize)
            {
                for (; i < fullAlignedSize; i += DF)
                {
                    SquaredDifferenceSum16f<align>(a, b, i + F * 0, sums[0]);
                    SquaredDifferenceSum16f<align>(a, b, i + F * 1, sums[1]);
                }
                sums[0] = vaddq_f32(sums[0], sums[1]);
            }
            for (; i < partialAlignedSize; i += F)
                SquaredDifferenceSum16f<align>(a, b, i, sums[0]);
            if (partialAlignedSize != size)
            {
                // Re-read the last F elements of BOTH inputs and mask the
                // difference. NOTE(review): RightNotZero presumably keeps only
                // the lanes not already counted above - confirm.
                // NOTE(review): a + size - F is not F-aligned in this branch;
                // confirm LoadHalf<align> tolerates that when align is true.
                float32x4_t tailMask = RightNotZero(size - partialAlignedSize);
                float32x4_t _a = vcvt_f32_f16((float16x4_t)LoadHalf<align>(a + size - F));
                float32x4_t _b = vcvt_f32_f16((float16x4_t)LoadHalf<align>(b + size - F));
                float32x4_t _d = And(vsubq_f32(_a, _b), tailMask);
                sums[0] = vaddq_f32(sums[0], vmulq_f32(_d, _d));
            }
            *sum = ExtractSum32f(sums[0]);
        }
开发者ID:flying19880517,项目名称:AntiDupl,代码行数:31,


示例16: FilterFarNEON

static void FilterFarNEON(    int num_partitions,    int x_fft_buf_block_pos,    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],    float y_fft[2][PART_LEN1]) {  int i;  for (i = 0; i < num_partitions; i++) {    int j;    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;    int pos = i * PART_LEN1;    // Check for wrap    if (i + x_fft_buf_block_pos >= num_partitions) {      xPos -= num_partitions * PART_LEN1;    }    // vectorized code (four at once)    for (j = 0; j + 3 < PART_LEN1; j += 4) {      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);      const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]);      const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]);      const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]);      const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]);      const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re);      const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im);      const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im);      const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re);      const float32x4_t g = vaddq_f32(y_fft_re, e);      const float32x4_t h = vaddq_f32(y_fft_im, f);      vst1q_f32(&y_fft[0][j], g);      vst1q_f32(&y_fft[1][j], h);    }    // scalar code for the remaining items.    for (; j < PART_LEN1; j++) {      y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j],                           x_fft_buf[1][xPos + j],                           h_fft_buf[0][pos + j],                           h_fft_buf[1][pos + j]);      y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j],                           x_fft_buf[1][xPos + j],                           h_fft_buf[0][pos + j],                           h_fft_buf[1][pos + j]);    }  }}
开发者ID:Crawping,项目名称:chromium_extract,代码行数:46,


示例17: test_vmulQf32

/* Calls vmulq_f32 once on two float32x4_t operands and stores the result in
   a local.
   NOTE(review): the operands are never initialized and the result is never
   read; presumably this is a compile-only test of the intrinsic - confirm.  */
void test_vmulQf32 (void)
{
  float32x4_t arg0_float32x4_t, arg1_float32x4_t;
  float32x4_t out_float32x4_t;

  out_float32x4_t = vmulq_f32 (arg0_float32x4_t, arg1_float32x4_t);
}
开发者ID:Akheon23,项目名称:chromecast-mirrored-source.toolchain,代码行数:8,


示例18: SubbandCoherenceNEON

// Updates the smoothed power spectra via SmoothedPSD() and then computes, for
// every bin i in [0, PART_LEN1):
//   cohde[i] = |sde[i]|^2 / (sd[i] * se[i] + 1e-10)
//   cohxd[i] = |sxd[i]|^2 / (sx[i] * sd[i] + 1e-10)
// where sde / sxd hold interleaved (re, im) pairs. The `fft` parameter is not
// used by this function.
static void SubbandCoherenceNEON(AecCore* aec,
                                 float efw[2][PART_LEN1],
                                 float dfw[2][PART_LEN1],
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd,
                                 int* extreme_filter_divergence) {
  int i;

  SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence);

  {
    const float32x4_t vec_1eminus10 =  vdupq_n_f32(1e-10f);

    // Subband coherence
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
      const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
      const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
      const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
      // Denominators: 1e-10 + sd * se and 1e-10 + sd * sx.
      const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
      const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
      // De-interleaving loads: val[0] = real parts, val[1] = imaginary parts.
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
      float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
      vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
      vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
      vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
      vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);

      vst1q_f32(&cohde[i], vec_cohde);
      vst1q_f32(&cohxd[i], vec_cohxd);
    }
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    cohde[i] =
        (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
        (aec->sd[i] * aec->se[i] + 1e-10f);
    cohxd[i] =
        (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
        (aec->sx[i] * aec->sd[i] + 1e-10f);
  }
}
开发者ID:Crawping,项目名称:chromium_extract,代码行数:45,


示例19: neon_coord_4

/* Performs one rotation/translation */
/*
 * Computes a*x + b*y + pos + point5 for four lanes at once, converts the
 * result to 32-bit integers and stores the four values at `result`.
 * NOTE(review): point5_4 presumably holds 0.5 in every lane so that the
 * float-to-int conversion rounds rather than truncates - confirm at the
 * call site.
 */
static void neon_coord_4(
    float32x4_t a_4,
    float32x4_t b_4,
    float32x4_t x_4,
    float32x4_t y_4,
    float32x4_t pos_4f,
    float32x4_t point5_4,
    int * result)
{
    /* Same association as the straight-line form: ((a*x + b*y) + pos) + 0.5 */
    const float32x4_t ax = vmulq_f32(a_4, x_4);
    const float32x4_t by = vmulq_f32(b_4, y_4);
    const float32x4_t rotated = vaddq_f32(ax, by);
    const float32x4_t translated = vaddq_f32(rotated, pos_4f);
    const float32x4_t biased = vaddq_f32(translated, point5_4);

    vst1q_s32(result, vcvtq_s32_f32(biased));
}
开发者ID:mittalrajat,项目名称:Robotics-Systems-Lab,代码行数:19,


示例20: WindowDataNEON

// Window time domain data to be used by the fft.static void WindowDataNEON(float* x_windowed, const float* x) {  int i;  for (i = 0; i < PART_LEN; i += 4) {    const float32x4_t vec_Buf1 = vld1q_f32(&x[i]);    const float32x4_t vec_Buf2 = vld1q_f32(&x[PART_LEN + i]);    const float32x4_t vec_sqrtHanning = vld1q_f32(&WebRtcAec_sqrtHanning[i]);    // A B C D    float32x4_t vec_sqrtHanning_rev =        vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);    // B A D C    vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev);    // D C B A    vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev),                                       vget_low_f32(vec_sqrtHanning_rev));    vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning));    vst1q_f32(&x_windowed[PART_LEN + i],            vmulq_f32(vec_Buf2, vec_sqrtHanning_rev));  }}
开发者ID:Crawping,项目名称:chromium_extract,代码行数:20,


示例21: SquaredDifferenceKahanSum32f

 // One Kahan (compensated) summation step that adds (a[i] - b[i])^2 for the
 // four floats at `offset` to `sum`.
 //
 // Per lane, with c = correction:
 //     term = d*d - c           (input minus the error carried so far)
 //     temp = sum + term
 //     c    = (temp - sum) - term   (what the addition actually added, minus
 //                                   what it should have added)
 //     sum  = temp
 // `sum` and `correction` are updated in place; `correction` should start at
 // zero and be passed unchanged between consecutive calls.
 template <bool align> SIMD_INLINE void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t offset, float32x4_t & sum, float32x4_t & correction)
 {
     float32x4_t _a = Load<align>(a + offset);
     float32x4_t _b = Load<align>(b + offset);
     float32x4_t _d = vsubq_f32(_a, _b);
     float32x4_t term = vsubq_f32(vmulq_f32(_d, _d), correction);
     float32x4_t temp = vaddq_f32(sum, term);
     // The rounding error is a difference, not a product: (temp - sum) - term.
     correction = vsubq_f32(vsubq_f32(temp, sum), term);
     sum = temp;
 }
开发者ID:flying19880517,项目名称:AntiDupl,代码行数:10,


示例22: HogDirectionHistograms

        // For four gradient vectors (dx, dy) at once, finds the direction with
        // the largest dot product against the tables buffer.cos / buffer.sin,
        // also testing the negated dot product of each entry. The winning
        // index (taken from buffer.pos[k] or buffer.neg[k]) is stored at
        // buffer.index + col and the gradient magnitude sqrt(dx^2 + dy^2) at
        // buffer.value + col. A candidate only replaces the current best when
        // its dot product is strictly greater; the best starts at 0 / index 0.
        template <bool align> SIMD_INLINE void HogDirectionHistograms(const float32x4_t & dx, const float32x4_t & dy, Buffer & buffer, size_t col)
        {
            float32x4_t maxDot = vdupq_n_f32(0);
            int32x4_t maxIndex = vdupq_n_s32(0);

            for (int k = 0; k < buffer.size; ++k)
            {
                // Projection onto direction k.
                const float32x4_t forward = vaddq_f32(vmulq_f32(dx, buffer.cos[k]), vmulq_f32(dy, buffer.sin[k]));
                const uint32x4_t forwardWins = vcgtq_f32(forward, maxDot);
                maxDot = vmaxq_f32(forward, maxDot);
                maxIndex = vbslq_s32(forwardWins, buffer.pos[k], maxIndex);

                // Projection onto the opposite direction.
                const float32x4_t backward = vnegq_f32(forward);
                const uint32x4_t backwardWins = vcgtq_f32(backward, maxDot);
                maxDot = vmaxq_f32(backward, maxDot);
                maxIndex = vbslq_s32(backwardWins, buffer.neg[k], maxIndex);
            }

            const float32x4_t squaredNorm = vaddq_f32(vmulq_f32(dx, dx), vmulq_f32(dy, dy));
            Store<align>(buffer.index + col, maxIndex);
            Store<align>(buffer.value + col, Sqrt<SIMD_NEON_RCP_ITER>(squaredNorm));
        }
开发者ID:4144,项目名称:Simd,代码行数:19,


示例23: FilterFarNEON

// Accumulates, for every partition i, the complex product of the far-end
// spectrum block and the filter block into yf:
//   yf += aec->xfBuf[block(i)] * aec->wfBuf[i]
// Index [0] of each buffer holds real parts, [1] imaginary parts. The far-end
// block index is i + aec->xfBufBlockPos, wrapped back by num_partitions.
static void FilterFarNEON(AecCore* aec, float yf[2][PART_LEN1]) {
  int i;
  const int num_partitions = aec->num_partitions;
  for (i = 0; i < num_partitions; i++) {
    int j;
    int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
    int pos = i * PART_LEN1;
    // Check for wrap
    if (i + aec->xfBufBlockPos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]);
      const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]);
      const float32x4_t wfBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]);
      const float32x4_t wfBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]);
      const float32x4_t yf_re = vld1q_f32(&yf[0][j]);
      const float32x4_t yf_im = vld1q_f32(&yf[1][j]);
      // Real part: x_re * w_re - x_im * w_im
      const float32x4_t a = vmulq_f32(xfBuf_re, wfBuf_re);
      const float32x4_t e = vmlsq_f32(a, xfBuf_im, wfBuf_im);
      // Imaginary part: x_re * w_im + x_im * w_re
      const float32x4_t c = vmulq_f32(xfBuf_re, wfBuf_im);
      const float32x4_t f = vmlaq_f32(c, xfBuf_im, wfBuf_re);
      const float32x4_t g = vaddq_f32(yf_re, e);
      const float32x4_t h = vaddq_f32(yf_im, f);
      vst1q_f32(&yf[0][j], g);
      vst1q_f32(&yf[1][j], h);
    }
    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      yf[0][j] += MulRe(aec->xfBuf[0][xPos + j],
                        aec->xfBuf[1][xPos + j],
                        aec->wfBuf[0][pos + j],
                        aec->wfBuf[1][pos + j]);
      yf[1][j] += MulIm(aec->xfBuf[0][xPos + j],
                        aec->xfBuf[1][xPos + j],
                        aec->wfBuf[0][pos + j],
                        aec->wfBuf[1][pos + j]);
    }
  }
}
开发者ID:RyanLee27,项目名称:pjproject,代码行数:42,


示例24: dotProd_neon

void dotProd_neon(const float *data, const float *weights, float *vals, const int n, const int len, const float *istd) {    for (int i = 0; i < n; i += 4) {        float32x4_t accum0 = { 0.0f, 0.0f, 0.0f, 0.0f };        float32x4_t accum1 = accum0;        float32x4_t accum2 = accum0;        float32x4_t accum3 = accum0;        for (int j = 0; j < len; j += 4) {            float32x4_t d0 = vld1q_f32(data + j);            float32x4_t d1 = d0;            float32x4_t d2 = d0;            float32x4_t d3 = d0;            float32x4_t w0 = vld1q_f32(weights);            float32x4_t w1 = vld1q_f32(weights + 4);            float32x4_t w2 = vld1q_f32(weights + 8);            float32x4_t w3 = vld1q_f32(weights + 12);            accum0 = vaddq_f32(accum0, vmulq_f32(d0, w0));            accum1 = vaddq_f32(accum1, vmulq_f32(d1, w1));            accum2 = vaddq_f32(accum2, vmulq_f32(d2, w2));            accum3 = vaddq_f32(accum3, vmulq_f32(d3, w3));            weights += 16;        }        float32x2_t sum0 = vpadd_f32(vget_low_f32(accum0), vget_high_f32(accum0));        float32x2_t sum1 = vpadd_f32(vget_low_f32(accum1), vget_high_f32(accum1));        float32x2_t sum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));        float32x2_t sum3 = vpadd_f32(vget_low_f32(accum3), vget_high_f32(accum3));        sum0 = vpadd_f32(sum0, sum1);        sum1 = vpadd_f32(sum2, sum3);        float32x4_t sum = vcombine_f32(sum0, sum1);                sum = vmulq_n_f32(sum, istd[0]);        sum = vaddq_f32(sum, vld1q_f32(weights + n*len + i));        vst1q_f32(vals + i, sum);    }}
开发者ID:IFeelBloated,项目名称:vapoursynth-nnedi3,代码行数:39,


示例25: mw_neon_mv_mul_f32x4

/*
 * f32x4 mv mul: C = A * B.
 *
 * A is read column by column with Row floats per column (consecutive columns
 * are Row elements apart), B supplies T floats and C receives Row floats.
 * Four rows and four columns are processed per step with full vector loads,
 * so Row and T are expected to be multiples of 4.
 */
void mw_neon_mv_mul_f32x4(float * A, int Row, int T, float * B, float * C)
{
	int i = 0;
	int k = 0;

	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{
		neon_c = vmovq_n_f32(0);

		for (k = 0; k < T; k+=4)
		{
			/*
			 * Offset of rows i..i+3 in column k. The column stride is Row,
			 * matching the +Row steps below; using T here would only be
			 * right for square matrices.
			 */
			int j = k * Row + i;

			neon_a0 = vld1q_f32(A + j);
			neon_a1 = vld1q_f32(A + j + Row);
			neon_a2 = vld1q_f32(A + j + 2 * Row);
			neon_a3 = vld1q_f32(A + j + 3 * Row);

			/* Broadcast B[k..k+3], one element per column. */
			neon_b = vld1q_f32(B + k);
			neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
			neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
			neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
			neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

			neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);
		}

		vst1q_f32(C + i, neon_c);
	}
}
开发者ID:Daniel-Hwang,项目名称:eeg,代码行数:39,


示例26: dotProd_i16_neon

void dotProd_i16_neon(const float *dataf, const float *weightsf, float *vals, const int n, const int len, const float *istd) {    const int16_t *data = (const int16_t *)dataf;    const int16_t *weights = (const int16_t *)weightsf;    weightsf += n * len / 2; // sizeof(float) / sizeof(int16_t)    for (int i = 0; i < n; i += 4) {        int32x4_t accum0 = { 0, 0, 0, 0 };        int32x4_t accum1 = accum0;        int32x4_t accum2 = accum0;        int32x4_t accum3 = accum0;        for (int j = 0; j < len; j += 8) {            int16x4x2_t d0 = vld2_s16(data + j);            int16x4x2_t w0 = vld2_s16(weights);            int16x4x2_t w1 = vld2_s16(weights + 8);            int16x4x2_t w2 = vld2_s16(weights + 16);            int16x4x2_t w3 = vld2_s16(weights + 24);            accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);            accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);            accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);            accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);            accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);            accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);            accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);            accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);            weights += 32;        }        int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));        int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));        int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));        int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));        sum0 = vpadd_s32(sum0, sum1);        sum1 = vpadd_s32(sum2, sum3);        int32x4_t sum = vcombine_s32(sum0, sum1);        float32x4_t val = vcvtq_f32_s32(sum);        val = vmulq_f32(val, vld1q_f32(weightsf + i*2));        val = vmulq_n_f32(val, istd[0]);        val = vaddq_f32(val, vld1q_f32(weightsf + i*2 + 4));        vst1q_f32(vals + i, val);    }}
开发者ID:IFeelBloated,项目名称:vapoursynth-nnedi3,代码行数:49,


示例27: neon_vector_mul

static void neon_vector_mul(const std::vector<float>& vec_a, const std::vector<float>& vec_b, std::vector<float>& vec_result){	assert(vec_a.size() == vec_b.size());	assert(vec_a.size() == vec_result.size());	int i = 0;	//neon process	for (; i < (int)vec_result.size() - 3 ; i+=4)	{		const auto data_a = vld1q_f32(&vec_a[i]);		const auto data_b = vld1q_f32(&vec_b[i]);		float* dst_ptr = &vec_result[i];		const auto data_res = vmulq_f32(data_a, data_b);		vst1q_f32(dst_ptr, data_res);	}	//normal process	for (; i < (int)vec_result.size(); i++)	{		vec_result[i] = vec_a[i] * vec_b[i];	}}
开发者ID:xylcbd,项目名称:blogs_code,代码行数:20,


示例28: saxpy_vector

//Kernel function: saxpyvoid saxpy_vector(KernelArgs* args) {    //Setup    const float32x4_t MASK_FALSE = vdupq_n_f32(0.f);    const float32x4_t MASK_TRUE = vcvtq_f32_u32(vceqq_f32(MASK_FALSE, MASK_FALSE));        //Uniforms        //Fuses        //Literals        //Stack variables    float32x4_t scale, x, y, result, var060, var061;        //Loop over input    uint64_t index;    for(index = 0; index < args->N; index += 4) {            //Inputs        scale = vld1q_f32(&args->scale[index]);        x = vld1q_f32(&args->x[index]);        y = vld1q_f32(&args->y[index]);                //Begin kernel logic        {                    //>>> result = scale * x + y            var061 = vmulq_f32(scale, x);            var060 = vaddq_f32(var061, y);            result = vbslq_f32(vcvtq_u32_f32(MASK_TRUE), var060, result);                }        //End kernel logic                //Outputs        vst1q_f32(&args->result[index], result);            }}
开发者ID:yongshanding,项目名称:vecAndroid,代码行数:41,


示例29: neon_VecdotVec

/** * @brief   vector_dot_vector. * * @param   dst[out]     the output element(1*1) * @param   src1[in]     the input  vector(1*n) *          src2[in]     the input  vector(1*n) *          dimN[in]     size of vector * * @return  void */void neon_VecdotVec(float *dst,                    const float *src1,                    const float *src2,                    const int dimN){    float *mat0 = (float *)src1;    float *mat1 = (float *)src2;    float32x4_t q0 = vld1q_f32(mat0);    float32x4_t q1 = vld1q_f32(mat1);    q0 = vmulq_f32(q0, q1);    int j = 4;    for (; j <= dimN - 4; j += 4)    {        float32x4_t q2 = vld1q_f32(mat0 + j);        float32x4_t q3 = vld1q_f32(mat1 + j);        q0 = vmlaq_f32(q0, q2, q3);    }    float32x2_t d0 = vpadd_f32(vget_low_f32(q0), vget_high_f32(q0));    d0 = vpadd_f32(d0, d0);    *dst = *((float *)&d0);    for (; j < dimN; j++) {        *dst += src1[j] * src2[j];    }}
开发者ID:shenjiang11,项目名称:iOS,代码行数:34,



注:本文中的vmulq_f32函数示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


C++ vmw_priv函数代码示例
C++ vmtruncate函数代码示例
万事OK自学网:51自学网_软件自学网_CAD自学网自学excel、自学PS、自学CAD、自学C语言、自学css3实例,是一个通过网络自主学习工作技能的自学平台,网友喜欢的软件自学网站。