您当前的位置:首页 > IT编程 > C++
| C语言 | Java | VB | VC | python | Android | TensorFlow | C++ | oracle | 学术与代码 | cnn卷积神经网络 | gnn | 图像修复 | Keras | 数据集 | Neo4j | 自然语言处理 | 深度学习 | 医学CAD | 医学影像 | 超参数 | pointnet | pytorch | 异常检测 | Transformers | 情感分类 | 知识图谱 |

自学教程:C++ vld1q_f32函数代码示例

51自学网 2021-06-03 09:44:56
  C++
这篇教程C++ vld1q_f32函数代码示例写得很实用,希望能帮到您。

本文整理汇总了C++中vld1q_f32函数的典型用法代码示例。如果您正苦于以下问题:C++ vld1q_f32函数的具体用法?C++ vld1q_f32怎么用?C++ vld1q_f32使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。

在下文中一共展示了vld1q_f32函数的30个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。

示例1: test_fma

/*
 * Micro-benchmark for a chain of NEON multiply / multiply-accumulate
 * operations. Fills data_f with its own indices, then repeatedly runs a
 * ten-tap weighted sum over sliding windows of data_f and writes each result
 * back in place. Timing is bracketed by startTime() / endTime().
 */
static void test_fma() {
    for (int idx = 0; idx < 1020 * 4; idx++) {
        data_f[idx] = idx;
    }

    /* Broadcast tap weights; the taps are applied symmetrically below. */
    float32x4_t w_02 = vdupq_n_f32(0.02f);
    float32x4_t w_04 = vdupq_n_f32(0.04f);
    float32x4_t w_05 = vdupq_n_f32(0.05f);
    float32x4_t w_10 = vdupq_n_f32(0.1f);
    float32x4_t w_20 = vdupq_n_f32(0.2f);
    float32x4_t one  = vdupq_n_f32(1.0f);

    startTime();

    /* Do ~1 billion ops */
    for (int pass = 0; pass < (1000 * (1000 / 80)); pass++) {
        for (int base = 0; base < 1000; base++) {
            float32x4_t acc = vmulq_f32(vld1q_f32((float32_t *)&data_f[base]), w_02);

            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[base + 4]),  w_04);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[base + 8]),  w_05);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[base + 12]), w_10);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[base + 16]), w_20);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[base + 20]), w_20);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[base + 24]), w_10);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[base + 28]), w_05);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[base + 32]), w_04);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[base + 36]), w_02);

            acc = vaddq_f32(acc, one);
            vst1q_f32((float32_t *)&data_f[base], acc);
        }
    }

    endTime("neon fma", 1e9);
}
开发者ID:Khaon,项目名称:android_system_extras,代码行数:34,


示例2: AudioBufferInPlaceScale_NEON

/*
 * Multiplies the first aSize samples of aBlock by aScale, in place.
 * The bulk of the buffer is handled 16 floats per iteration with NEON; the
 * final (aSize % 16) samples are handled one at a time.
 */
void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize) {
  ASSERT_ALIGNED(aBlock);

  const float32x4_t gain = vmovq_n_f32(aScale);
  const uint32_t tail = aSize % 16;
  const uint32_t bulk = aSize - tail;
  uint32_t pos = 0;

  while (pos < bulk) {
    const float32x4_t in0 = vld1q_f32(ADDRESS_OF(aBlock, pos));
    const float32x4_t in1 = vld1q_f32(ADDRESS_OF(aBlock, pos + 4));
    const float32x4_t in2 = vld1q_f32(ADDRESS_OF(aBlock, pos + 8));
    const float32x4_t in3 = vld1q_f32(ADDRESS_OF(aBlock, pos + 12));

    const float32x4_t out0 = vmulq_f32(in0, gain);
    const float32x4_t out1 = vmulq_f32(in1, gain);
    const float32x4_t out2 = vmulq_f32(in2, gain);
    const float32x4_t out3 = vmulq_f32(in3, gain);

    vst1q_f32(ADDRESS_OF(aBlock, pos), out0);
    vst1q_f32(ADDRESS_OF(aBlock, pos + 4), out1);
    vst1q_f32(ADDRESS_OF(aBlock, pos + 8), out2);
    vst1q_f32(ADDRESS_OF(aBlock, pos + 12), out3);

    pos += 16;
  }

  /* Scalar pass over the samples that did not fill a 16-float group. */
  for (unsigned left = tail; left != 0; --left, ++pos) {
    aBlock[pos] *= aScale;
  }
}
开发者ID:jasonLaster,项目名称:gecko-dev,代码行数:31,


示例3: scaled_sumf_thread_NEON

/*
 * Worker routine computing r[i] = x[i] + a * y[i] for i in [0, size).
 *
 * argument points to a struct scaled_sumfneon_thread_data supplying r, x, y,
 * a and size. Full groups of four elements are processed with NEON; any
 * remaining 1-3 elements are processed one at a time so that nothing beyond
 * index size-1 is read or written.
 *
 * Always returns a null pointer.
 */
void * scaled_sumf_thread_NEON(void * argument)
{
    struct scaled_sumfneon_thread_data * data = (struct scaled_sumfneon_thread_data *) argument;
    float32_t * r = (float32_t *)data->r;
    const float32_t * x = (const float32_t *)data->x;
    const float32_t * y = (const float32_t *)data->y;
    const float32_t a = (const float32_t)data->a;
    const jsize size = data->size;
    const float32x4_t ax4 = vdupq_n_f32(a);
    jsize i = 0;

    /* Vector part: only while a complete group of four is still available. */
    for (; size - i >= 4; i += 4)
    {
        const float32x4_t xx4 = vld1q_f32(&(x[i]));
        const float32x4_t yx4 = vld1q_f32(&(y[i]));
        const float32x4_t rx4 = vmlaq_f32(xx4, ax4, yx4);
        vst1q_f32(&(r[i]), rx4);
    }

    /* Scalar part for the leftover elements. */
    for (; i < size; ++i)
    {
        r[i] = x[i] + a * y[i];
    }

    /* The function is declared to return void *, so it must return a value. */
    return (void *)0;
}
开发者ID:dribbroc,项目名称:HONEI-Droid,代码行数:25,


示例4: neon_scale

/**
 * @brief   Vector scale: dst[i] = alpha * src[i].
 *
 * @param   dst[out]    receives the scaled elements.
 *          src[in]     elements to scale.
 *          alpha[in]   scale factor.
 *          elemCnt[in] number of elements to process.
 *
 * @return  void.
 */
void neon_scale(float *dst,
                const float *src,
                const float alpha,
                const int elemCnt)
{
    int pos = 0;

    /* 16 floats per iteration while a full group remains. */
    while (pos <= elemCnt - 16)
    {
        const float *in = src + pos;
        float *out = dst + pos;

        const float32x4_t a0 = vld1q_f32(in);
        const float32x4_t a1 = vld1q_f32(in + 4);
        const float32x4_t a2 = vld1q_f32(in + 8);
        const float32x4_t a3 = vld1q_f32(in + 12);

        const float32x4_t s0 = vmulq_n_f32(a0, alpha);
        const float32x4_t s1 = vmulq_n_f32(a1, alpha);
        const float32x4_t s2 = vmulq_n_f32(a2, alpha);
        const float32x4_t s3 = vmulq_n_f32(a3, alpha);

        vst1q_f32(out,      s0);
        vst1q_f32(out + 4,  s1);
        vst1q_f32(out + 8,  s2);
        vst1q_f32(out + 12, s3);

        pos += 16;
    }

    /* Remaining elements, one at a time. */
    while (pos < elemCnt)
    {
        dst[pos] = src[pos] * alpha;
        pos++;
    }
}
开发者ID:shenjiang11,项目名称:iOS,代码行数:36,


示例5: AudioBlockCopyChannelWithScale_NEON

/*
 * Writes aInput[i] * aScale to aOutput[i] for the first WEBAUDIO_BLOCK_SIZE
 * samples, 16 floats per loop iteration.
 * NOTE(review): there is no scalar tail, so this presumably relies on
 * WEBAUDIO_BLOCK_SIZE being a multiple of 16 - confirm against its definition.
 */
void
AudioBlockCopyChannelWithScale_NEON(const float* aInput,
                                    float aScale,
                                    float* aOutput)
{
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aOutput);

  const float32x4_t gain = vmovq_n_f32(aScale);

  for (uint32_t pos = 0; pos < WEBAUDIO_BLOCK_SIZE; pos += 16) {
    const float32x4_t in0 = vld1q_f32(ADDRESS_OF(aInput, pos));
    const float32x4_t in1 = vld1q_f32(ADDRESS_OF(aInput, pos+4));
    const float32x4_t in2 = vld1q_f32(ADDRESS_OF(aInput, pos+8));
    const float32x4_t in3 = vld1q_f32(ADDRESS_OF(aInput, pos+12));

    const float32x4_t out0 = vmulq_f32(in0, gain);
    const float32x4_t out1 = vmulq_f32(in1, gain);
    const float32x4_t out2 = vmulq_f32(in2, gain);
    const float32x4_t out3 = vmulq_f32(in3, gain);

    vst1q_f32(ADDRESS_OF(aOutput, pos), out0);
    vst1q_f32(ADDRESS_OF(aOutput, pos+4), out1);
    vst1q_f32(ADDRESS_OF(aOutput, pos+8), out2);
    vst1q_f32(ADDRESS_OF(aOutput, pos+12), out3);
  }
}
开发者ID:Incognito,项目名称:mozilla-central,代码行数:29,


示例6: ScaleErrorSignalNEON

/*
 * Normalises the complex error spectrum ef (ef[0] = real, ef[1] = imaginary)
 * by x_pow, limits its magnitude to the error threshold and applies the step
 * size mu. When extended_filter_enabled is non-zero, kExtendedMu and
 * kExtendedErrorThreshold replace the two "normal" parameters.
 */
static void ScaleErrorSignalNEON(int extended_filter_enabled,
                                 float normal_mu,
                                 float normal_error_threshold,
                                 float x_pow[PART_LEN1],
                                 float ef[2][PART_LEN1]) {
  const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
  const float error_threshold = extended_filter_enabled ?
      kExtendedErrorThreshold : normal_error_threshold;
  const float32x4_t tiny = vdupq_n_f32(1e-10f);
  const float32x4_t step = vmovq_n_f32(mu);
  const float32x4_t limit = vmovq_n_f32(error_threshold);
  int i;

  /* Vector path: four bins per iteration. */
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t power = vld1q_f32(&x_pow[i]);
    const float32x4_t re_in = vld1q_f32(&ef[0][i]);
    const float32x4_t im_in = vld1q_f32(&ef[1][i]);
    const float32x4_t denom = vaddq_f32(power, tiny);

    float32x4_t re = vdivq_f32(re_in, denom);
    float32x4_t im = vdivq_f32(im_in, denom);

    const float32x4_t mag = vsqrtq_f32(vmlaq_f32(vmulq_f32(re, re), im, im));
    const uint32x4_t over = vcgtq_f32(mag, limit);
    const float32x4_t shrink = vdivq_f32(limit, vaddq_f32(mag, tiny));

    /* Per lane: keep the shrunk value where |ef| exceeds the limit,
       otherwise keep the value unchanged (bitwise select). */
    re = vbslq_f32(over, vmulq_f32(re, shrink), re);
    im = vbslq_f32(over, vmulq_f32(im, shrink), im);

    vst1q_f32(&ef[0][i], vmulq_f32(re, step));
    vst1q_f32(&ef[1][i], vmulq_f32(im, step));
  }

  /* Scalar path for the bins left over by the vector loop. */
  for (; i < PART_LEN1; i++) {
    float magnitude;

    ef[0][i] /= (x_pow[i] + 1e-10f);
    ef[1][i] /= (x_pow[i] + 1e-10f);
    magnitude = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (magnitude > error_threshold) {
      const float shrink = error_threshold / (magnitude + 1e-10f);
      ef[0][i] *= shrink;
      ef[1][i] *= shrink;
    }

    /* Stepsize factor */
    ef[0][i] *= mu;
    ef[1][i] *= mu;
  }
}
开发者ID:Crawping,项目名称:chromium_extract,代码行数:59,


示例7: dotProduct

// Dot product of two f32 images of the given size, accumulated into an f64.
// Within a row, elements are summed in single precision in chunks of at most
// DOT_FLOAT_BLOCKSIZE elements, and each chunk total is then added to the
// double-precision result. Without CAROTENE_NEON the function returns 0.
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);

    // When both images share a stride equal to the row byte width, treat the
    // whole image as a single long row.
    const bool sameStride = (src0Stride == src1Stride);
    if (sameStride && src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }

#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 total = 0.0;
    for (size_t y = 0; y < size.height; ++y)
    {
        const f32 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
        const f32 * rowB = internal::getRowPtr(src1Base, src1Stride, y);

        size_t x = 0;
        while (x + 4 <= size.width)
        {
            // Last start index of a 4-float group inside the current chunk.
            const size_t last = std::min(x + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t acc = vdupq_n_f32(0.0f);

            for ( ; x <= last; x += 4)
            {
                internal::prefetch(rowA + x);
                internal::prefetch(rowB + x);
                acc = vmlaq_f32(acc, vld1q_f32(rowA + x), vld1q_f32(rowB + x));
            }

            const float32x2_t halves = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc));
            total += vget_lane_f32(halves, 0) + vget_lane_f32(halves, 1);
        }

        // One 2-float group, if at least two elements remain.
        if (x + 2 <= size.width)
        {
            const float32x2_t prod = vmul_f32(vld1_f32(rowA + x), vld1_f32(rowB + x));
            total += vget_lane_f32(prod, 0) + vget_lane_f32(prod, 1);
            x += 2;
        }

        // Final single element, if any.
        for ( ; x < size.width; ++x)
            total += rowA[x] * rowB[x];
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
开发者ID:007Indian,项目名称:opencv,代码行数:59,


示例8: qcms_transform_data_rgba_out_lut_neon

/*
 * Transforms `length` RGBA pixels from src to dest. For each pixel the R, G
 * and B bytes are looked up in the input gamma tables, combined through the
 * three rows of transform->matrix, clamped between `zero` and
 * `clampMaxValue`, scaled by `floatScale`, converted to integer indices and
 * looked up in the output tables. The alpha byte is copied through unchanged.
 */
void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
  float32_t (*mat)[4] = transform->matrix;

  const float32_t *igtbl_r = (float32_t*)transform->input_gamma_table_r;
  const float32_t *igtbl_g = (float32_t*)transform->input_gamma_table_g;
  const float32_t *igtbl_b = (float32_t*)transform->input_gamma_table_b;

  const uint8_t *otdata_r = &transform->output_table_r->data[0];
  const uint8_t *otdata_g = &transform->output_table_g->data[0];
  const uint8_t *otdata_b = &transform->output_table_b->data[0];

  const float32x4_t row0 = vld1q_f32(mat[0]);
  const float32x4_t row1 = vld1q_f32(mat[1]);
  const float32x4_t row2 = vld1q_f32(mat[2]);

  const float32x4_t upper = vld1q_dup_f32(&clampMaxValue);
  const float32x4_t lower = vld1q_dup_f32(&zero);
  const float32x4_t scale = vld1q_dup_f32(&floatScale);

  size_t remaining;

  /* CYA */
  if (!length)
    return;

  for (remaining = length; remaining != 0; remaining--) {
    /* gamma lookup, broadcast to all lanes, times the matrix row */
    const float32x4_t part_r = vmulq_f32(vld1q_dup_f32(&igtbl_r[src[0]]), row0);
    const float32x4_t part_g = vmulq_f32(vld1q_dup_f32(&igtbl_g[src[1]]), row1);
    const float32x4_t part_b = vmulq_f32(vld1q_dup_f32(&igtbl_b[src[2]]), row2);
    const unsigned char alpha = src[3];
    float32x4_t mixed;
    int32x4_t index;

    /* sum the three contributions and clamp */
    mixed = vaddq_f32(part_r, vaddq_f32(part_g, part_b));
    mixed = vmaxq_f32(lower, mixed);
    mixed = vminq_f32(upper, mixed);
    index = vcvtq_s32_f32(vmulq_f32(mixed, scale));

    /* use the computed indices to emit RGB, then pass alpha through */
    dest[0] = otdata_r[vgetq_lane_s32(index, 0)];
    dest[1] = otdata_g[vgetq_lane_s32(index, 1)];
    dest[2] = otdata_b[vgetq_lane_s32(index, 2)];
    dest[3] = alpha;

    src += 4;
    dest += 4;
  }
}
开发者ID:makotokato,项目名称:qcms,代码行数:57,


示例9: AudioBlockPanStereoToStereo_NEON

/*
 * Stereo-to-stereo panning over one block of WEBAUDIO_BLOCK_SIZE samples,
 * eight samples per iteration.
 *
 * aIsOnTheLeft == true:   outL = inL + inR * aGainL,   outR = inR * aGainR
 * aIsOnTheLeft == false:  outL = inL * aGainL,         outR = inR + inL * aGainR
 */
void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                      float aGainL, float aGainR,
                                      bool aIsOnTheLeft,
                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  const float32x4_t gainL = vmovq_n_f32(aGainL);
  const float32x4_t gainR = vmovq_n_f32(aGainR);

  for (uint32_t pos = 0; pos < WEBAUDIO_BLOCK_SIZE; pos += 8) {
    /* All four inputs are read before anything is written. */
    const float32x4_t left0 = vld1q_f32(ADDRESS_OF(aInputL, pos));
    const float32x4_t left1 = vld1q_f32(ADDRESS_OF(aInputL, pos + 4));
    const float32x4_t right0 = vld1q_f32(ADDRESS_OF(aInputR, pos));
    const float32x4_t right1 = vld1q_f32(ADDRESS_OF(aInputR, pos + 4));

    float32x4_t outL0, outL1, outR0, outR1;

    if (aIsOnTheLeft) {
      outL0 = vmlaq_f32(left0, right0, gainL);
      outL1 = vmlaq_f32(left1, right1, gainL);
      outR0 = vmulq_f32(right0, gainR);
      outR1 = vmulq_f32(right1, gainR);
    } else {
      outL0 = vmulq_f32(left0, gainL);
      outL1 = vmulq_f32(left1, gainL);
      outR0 = vmlaq_f32(right0, left0, gainR);
      outR1 = vmlaq_f32(right1, left1, gainR);
    }

    vst1q_f32(ADDRESS_OF(aOutputL, pos), outL0);
    vst1q_f32(ADDRESS_OF(aOutputL, pos + 4), outL1);
    vst1q_f32(ADDRESS_OF(aOutputR, pos), outR0);
    vst1q_f32(ADDRESS_OF(aOutputR, pos + 4), outR1);
  }
}
开发者ID:jasonLaster,项目名称:gecko-dev,代码行数:60,


示例10: dotProd_i16_neon

/*
 * Computes n dot products of length len between one int16 data vector and n
 * int16 weight vectors, four outputs per outer iteration.
 *
 * dataf and weightsf are reinterpreted as int16_t arrays. The int16 weights
 * occupy the first n*len/2 floats of weightsf; the floats after them are read
 * in groups of eight per four outputs: four multipliers, then four addends.
 * Each output is  dot * multiplier * istd[0] + addend  and is stored in vals.
 *
 * The loops step by 4 (outputs) and 8 (elements) with no tail handling.
 */
void dotProd_i16_neon(const float *dataf, const float *weightsf, float *vals, const int n, const int len, const float *istd) {
    const int16_t *data = (const int16_t *)dataf;
    const int16_t *weights = (const int16_t *)weightsf;
    weightsf += n * len / 2; // sizeof(float) / sizeof(int16_t)

    for (int out = 0; out < n; out += 4) {
        int32x4_t acc[4];
        for (int q = 0; q < 4; q++) {
            acc[q] = vdupq_n_s32(0);
        }

        for (int pos = 0; pos < len; pos += 8) {
            /* De-interleaving load: val[0] = even elements, val[1] = odd. */
            const int16x4x2_t d = vld2_s16(data + pos);

            for (int q = 0; q < 4; q++) {
                const int16x4x2_t w = vld2_s16(weights + 8 * q);
                acc[q] = vmlal_s16(acc[q], d.val[0], w.val[0]);
                acc[q] = vmlal_s16(acc[q], d.val[1], w.val[1]);
            }

            weights += 32;
        }

        /* Horizontal reduction: one 32-bit total per accumulator. */
        const int32x2_t pair0 = vpadd_s32(vget_low_s32(acc[0]), vget_high_s32(acc[0]));
        const int32x2_t pair1 = vpadd_s32(vget_low_s32(acc[1]), vget_high_s32(acc[1]));
        const int32x2_t pair2 = vpadd_s32(vget_low_s32(acc[2]), vget_high_s32(acc[2]));
        const int32x2_t pair3 = vpadd_s32(vget_low_s32(acc[3]), vget_high_s32(acc[3]));
        const int32x4_t totals = vcombine_s32(vpadd_s32(pair0, pair1),
                                              vpadd_s32(pair2, pair3));

        float32x4_t result = vcvtq_f32_s32(totals);
        result = vmulq_f32(result, vld1q_f32(weightsf + out*2));
        result = vmulq_n_f32(result, istd[0]);
        result = vaddq_f32(result, vld1q_f32(weightsf + out*2 + 4));
        vst1q_f32(vals + out, result);
    }
}
开发者ID:IFeelBloated,项目名称:vapoursynth-nnedi3,代码行数:49,


示例11: mw_neon_mm_mul_f32x4

/*
 * f32x4 mm mul: C = A * B for column-major matrices.
 *
 *   A  : Row x T,   element (i, j) at A[j * Row + i]
 *   B  : T   x Col, element (j, k) at B[k * T   + j]
 *   C  : Row x Col, element (i, k) at C[k * Row + i]
 *
 * Row and T must be multiples of 4: the kernel works on 4x4 tiles and has no
 * tail handling.
 *
 * Fixes relative to the previous version:
 *  - A was indexed with j * T + i and B with k * Row + j, which only matches
 *    the layout above when Row == T; non-square inputs read the wrong
 *    elements (or past the end of the arrays).
 *  - The result column segment is stored once after the accumulation instead
 *    of on every inner iteration. As a consequence C is written (with zeros)
 *    even when T <= 0.
 */
void mw_neon_mm_mul_f32x4(float * A, int Row, int T, float * B, int Col, float * C)
{
	int i, k, j;

	for (i = 0; i < Row; i+=4)
	{
		for (k = 0; k < Col; k+=1)
		{
			const float * b_col = B + k * T;
			float32x4_t neon_c = vmovq_n_f32(0);

			for (j = 0; j < T; j+=4)
			{
				/* Rows i..i+3 of columns j..j+3 of A. */
				const float * a_col = A + j * Row + i;
				const float32x4_t neon_a0 = vld1q_f32(a_col);
				const float32x4_t neon_a1 = vld1q_f32(a_col + Row);
				const float32x4_t neon_a2 = vld1q_f32(a_col + 2 * Row);
				const float32x4_t neon_a3 = vld1q_f32(a_col + 3 * Row);

				/* Rows j..j+3 of column k of B; each lane scales one A column. */
				const float32x4_t neon_b = vld1q_f32(b_col + j);

				neon_c = vaddq_f32(vmulq_n_f32(neon_a0, vgetq_lane_f32(neon_b, 0)), neon_c);
				neon_c = vaddq_f32(vmulq_n_f32(neon_a1, vgetq_lane_f32(neon_b, 1)), neon_c);
				neon_c = vaddq_f32(vmulq_n_f32(neon_a2, vgetq_lane_f32(neon_b, 2)), neon_c);
				neon_c = vaddq_f32(vmulq_n_f32(neon_a3, vgetq_lane_f32(neon_b, 3)), neon_c);
			}

			vst1q_f32(C + k * Row + i, neon_c);
		}
	}
}
开发者ID:Daniel-Hwang,项目名称:eeg,代码行数:50,


示例12: t1

/*
 * Loads four floats from addr1 and stores them to addr2. The CHECK lines are
 * FileCheck directives describing the expected IR and are kept verbatim.
 */
void t1(AlignedAddr *addr1, AlignedAddr *addr2) {
// CHECK: @t1
// CHECK: call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %{{.*}}, i32 16)
  float32x4_t loaded = vld1q_f32(addr1);
// CHECK: call void @llvm.arm.neon.vst1.v4f32(i8* %{{.*}}, <4 x float> %{{.*}}, i32 16)
  vst1q_f32(addr2, loaded);
}
开发者ID:FrOSt-Foundation,项目名称:clang,代码行数:7,


示例13: ScaleRFFTData

/*
 * Scale FFT data in place by 2/|length|. |length| must be a power of two.
 *
 * A |length| of zero is treated as an empty buffer and nothing is touched
 * (previously fftData[0] was multiplied by 2/0). The function now has an
 * explicit void return type; the implicit-int form is not valid since C99.
 */
static inline void ScaleRFFTData(OMX_F32* fftData, unsigned length) {
  float32_t* data = (float32_t*)fftData;
  float32_t scale;

  if (length == 0) {
    return;
  }

  scale = 2.0f / length;

  if (length >= 4) {
    /*
     * Do 4 float elements at a time because |length| is always a
     * multiple of 4 when |length| >= 4.
     *
     * TODO(rtoy): Figure out how to process 8 elements at a time
     * using intrinsics or replace this with inline assembly.
     */
    do {
      float32x4_t x = vld1q_f32(data);

      length -= 4;
      x = vmulq_n_f32(x, scale);
      vst1q_f32(data, x);
      data += 4;
    } while (length > 0);
  } else if (length == 2) {
    float32x2_t x = vld1_f32(data);
    x = vmul_n_f32(x, scale);
    vst1_f32(data, x);
  } else {
    /* Remaining case for a power-of-two length: a single element. */
    fftData[0] *= scale;
  }
}
开发者ID:venkatarajasekhar,项目名称:Qt,代码行数:31,


示例14: SubbandCoherenceNEON

/*
 * Calls SmoothedPSD() and then fills cohde / cohxd for every bin i:
 *
 *   cohde[i] = |sde[i]|^2 / (sd[i] * se[i] + 1e-10)
 *   cohxd[i] = |sxd[i]|^2 / (sd[i] * sx[i] + 1e-10)
 *
 * where sde / sxd hold (real, imaginary) pairs. The `fft` argument is not
 * used by this function.
 */
static void SubbandCoherenceNEON(AecCore* aec,
                                 float efw[2][PART_LEN1],
                                 float dfw[2][PART_LEN1],
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd,
                                 int* extreme_filter_divergence) {
  int i;

  SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence);

  {
    const float32x4_t tiny = vdupq_n_f32(1e-10f);

    // Subband coherence, four bins per iteration.
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
      const float32x4_t sd = vld1q_f32(&aec->sd[i]);
      const float32x4_t se = vld1q_f32(&aec->se[i]);
      const float32x4_t sx = vld1q_f32(&aec->sx[i]);
      const float32x4_t denom_de = vmlaq_f32(tiny, sd, se);
      const float32x4_t denom_xd = vmlaq_f32(tiny, sd, sx);

      // De-interleaving loads: val[0] = real parts, val[1] = imaginary parts.
      const float32x4x2_t sde = vld2q_f32(&aec->sde[i][0]);
      const float32x4x2_t sxd = vld2q_f32(&aec->sxd[i][0]);

      float32x4_t coh_de = vmulq_f32(sde.val[0], sde.val[0]);
      float32x4_t coh_xd = vmulq_f32(sxd.val[0], sxd.val[0]);

      coh_de = vmlaq_f32(coh_de, sde.val[1], sde.val[1]);
      coh_de = vdivq_f32(coh_de, denom_de);
      coh_xd = vmlaq_f32(coh_xd, sxd.val[1], sxd.val[1]);
      coh_xd = vdivq_f32(coh_xd, denom_xd);

      vst1q_f32(&cohde[i], coh_de);
      vst1q_f32(&cohxd[i], coh_xd);
    }
  }

  // Scalar path for the bins left over by the vector loop.
  for (; i < PART_LEN1; i++) {
    cohde[i] =
        (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
        (aec->sd[i] * aec->se[i] + 1e-10f);
    cohxd[i] =
        (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
        (aec->sx[i] * aec->sd[i] + 1e-10f);
  }
}
开发者ID:Crawping,项目名称:chromium_extract,代码行数:45,


示例15: PartitionDelayNEON

/*
 * Measures the energy in each filter partition and returns the index of the
 * partition with the highest energy (0 if no partition has positive energy).
 * TODO(bjornv): Spread computational cost by computing one partition per
 * block?
 */
static int PartitionDelayNEON(const AecCore* aec) {
  float best_energy = 0;
  int best_partition = 0;
  int part;

  for (part = 0; part < aec->num_partitions; part++) {
    const int base = part * PART_LEN1;
    float32x4_t acc = vdupq_n_f32(0.0f);
    float32x2_t folded;
    float energy;
    int bin;

    // Vector path: accumulate re^2 + im^2, four bins per iteration.
    for (bin = 0; bin + 3 < PART_LEN1; bin += 4) {
      const float32x4_t re = vld1q_f32(&aec->wfBuf[0][base + bin]);
      const float32x4_t im = vld1q_f32(&aec->wfBuf[1][base + bin]);
      acc = vmlaq_f32(acc, re, re);
      acc = vmlaq_f32(acc, im, im);
    }

    // Horizontal sum: (A B C D) -> (A+B, C+D) -> (A+B+C+D, A+B+C+D).
    folded = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc));
    folded = vpadd_f32(folded, folded);
    energy = vget_lane_f32(folded, 0);

    // Scalar path for the bins left over by the vector loop.
    for (; bin < PART_LEN1; bin++) {
      energy += aec->wfBuf[0][base + bin] * aec->wfBuf[0][base + bin] +
                aec->wfBuf[1][base + bin] * aec->wfBuf[1][base + bin];
    }

    if (energy > best_energy) {
      best_energy = energy;
      best_partition = part;
    }
  }

  return best_partition;
}
开发者ID:Crawping,项目名称:chromium_extract,代码行数:44,


示例16: Scale_arm::forward

int Scale_arm::forward(const Mat& bottom_blob, Mat& top_blob) const{    int w = bottom_blob.w;    int h = bottom_blob.h;    int channels = bottom_blob.c;    int size = w * h;    top_blob.create(w, h, channels);    if (top_blob.empty())        return -100;    if (bias_term)    {        const float* scale_ptr = scale_data;        const float* bias_ptr = bias_data;        #pragma omp parallel for        for (int q=0; q<channels; q++)        {            const float* ptr = bottom_blob.channel(q);            float* outptr = top_blob.channel(q);            float s = scale_ptr[q];            float bias = bias_ptr[q];#if __ARM_NEON            int nn = size >> 2;            int remain = size - (nn << 2);#else            int remain = size;#endif // __ARM_NEON#if __ARM_NEON            float32x4_t _s = vdupq_n_f32(s);            float32x4_t _bias = vdupq_n_f32(bias);            for (; nn>0; nn--)            {                float32x4_t _p = vld1q_f32(ptr);                _p = vmlaq_f32(_bias, _p, _s);                vst1q_f32(outptr, _p);                ptr += 4;                outptr += 4;            }#endif // __ARM_NEON            for (; remain>0; remain--)            {                *outptr = *ptr * s + bias;                ptr++;                outptr++;            }        }    }    else    {
开发者ID:superhero1991,项目名称:ncnn,代码行数:56,


示例17: WindowDataNEON

// Window time domain data to be used by the fft.
// The first PART_LEN samples of x are multiplied by WebRtcAec_sqrtHanning read
// forwards; the next PART_LEN samples are multiplied by the same table read
// backwards starting at index PART_LEN.
static void WindowDataNEON(float* x_windowed, const float* x) {
  int i;

  for (i = 0; i < PART_LEN; i += 4) {
    const float32x4_t first_half = vld1q_f32(&x[i]);
    const float32x4_t second_half = vld1q_f32(&x[PART_LEN + i]);
    const float32x4_t window = vld1q_f32(&WebRtcAec_sqrtHanning[i]);

    // Four table entries ending at index PART_LEN - i, in memory order A B C D.
    const float32x4_t tail = vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
    // Swap within each 64-bit half: B A D C.
    const float32x4_t swapped = vrev64q_f32(tail);
    // Swap the two halves: D C B A.
    const float32x4_t window_rev =
        vcombine_f32(vget_high_f32(swapped), vget_low_f32(swapped));

    vst1q_f32(&x_windowed[i], vmulq_f32(first_half, window));
    vst1q_f32(&x_windowed[PART_LEN + i], vmulq_f32(second_half, window_rev));
  }
}
开发者ID:Crawping,项目名称:chromium_extract,代码行数:20,


示例18: compare_neon_ge

/*
 * Element-wise "greater or equal" against a scalar:
 * pdst[i] = 255 when psrc1[i] >= src2, otherwise 0, for i in [0, size).
 * Eight elements are handled per NEON iteration; the rest one at a time.
 */
void compare_neon_ge(float *psrc1, float src2,  uchar *pdst, int size)
{
	const float32x4_t bound = vdupq_n_f32(src2);
	const int vec_limit = size - 7;
	int i = 0;

	while (i < vec_limit) {
		const float32x4_t lo = vld1q_f32(psrc1 + i);
		const float32x4_t hi = vld1q_f32(psrc1 + i + 4);

		/* Each lane is all ones where the comparison holds, else zero. */
		const uint32x4_t mask_lo = vcgeq_f32(lo, bound);
		const uint32x4_t mask_hi = vcgeq_f32(hi, bound);

		/* Narrow 32 -> 16 -> 8 bits, keeping the low bits of every lane. */
		const uint16x8_t mask16 = vcombine_u16(vmovn_u32(mask_lo), vmovn_u32(mask_hi));
		vst1_u8(pdst + i, vmovn_u16(mask16));

		i += 8;
	}

	while (i < size) {
		pdst[i] = (psrc1[i] >= src2 ) ? 255 : 0;
		i++;
	}
}
开发者ID:Yvaine,项目名称:common_tool,代码行数:20,


示例19: neon_vector_mul

// Element-wise product: vec_result[i] = vec_a[i] * vec_b[i].
// All three vectors must have the same size (checked with assert).
static void neon_vector_mul(const std::vector<float>& vec_a, const std::vector<float>& vec_b, std::vector<float>& vec_result)
{
	assert(vec_a.size() == vec_b.size());
	assert(vec_a.size() == vec_result.size());

	const int count = (int)vec_result.size();
	int pos = 0;

	// NEON path: four floats per iteration.
	while (pos < count - 3)
	{
		const float32x4_t lhs = vld1q_f32(&vec_a[pos]);
		const float32x4_t rhs = vld1q_f32(&vec_b[pos]);
		vst1q_f32(&vec_result[pos], vmulq_f32(lhs, rhs));
		pos += 4;
	}

	// Scalar path for the remaining elements.
	while (pos < count)
	{
		vec_result[pos] = vec_a[pos] * vec_b[pos];
		pos++;
	}
}
开发者ID:xylcbd,项目名称:blogs_code,代码行数:20,


示例20: saxpy_vector

//Kernel function: saxpy
//Computes result[i] = scale[i] * x[i] + y[i] for the arrays referenced by
//args, four elements per iteration, for index in [0, args->N).
//NOTE(review): there is no scalar tail, so when N is not a multiple of 4 the
//last iteration touches elements past N - confirm the buffers are padded.
void saxpy_vector(KernelArgs* args) {

    //Setup
    const float32x4_t MASK_FALSE = vdupq_n_f32(0.f);
    //All-ones lane mask. It is kept as the integer compare result; the
    //previous version round-tripped it through a float conversion.
    const uint32x4_t MASK_TRUE = vceqq_f32(MASK_FALSE, MASK_FALSE);

    //Stack variables
    float32x4_t scale, x, y, var060, var061;
    //Initialised because vbslq_f32 reads it as the "else" operand; the
    //previous version passed it uninitialised.
    float32x4_t result = MASK_FALSE;

    //Loop over input
    uint64_t index;
    for(index = 0; index < args->N; index += 4) {

        //Inputs
        scale = vld1q_f32(&args->scale[index]);
        x = vld1q_f32(&args->x[index]);
        y = vld1q_f32(&args->y[index]);

        //Begin kernel logic
        {
            //>>> result = scale * x + y
            var061 = vmulq_f32(scale, x);
            var060 = vaddq_f32(var061, y);
            result = vbslq_f32(MASK_TRUE, var060, result);
        }
        //End kernel logic

        //Outputs
        vst1q_f32(&args->result[index], result);
    }
}
开发者ID:yongshanding,项目名称:vecAndroid,代码行数:41,


示例21: mw_neon_mm_add_f32x4

/*
 * f32x4 add: C[n] = A[n] + B[n] for n in [0, Row * Col).
 * Full groups of four are added with NEON, the remainder one at a time.
 */
void mw_neon_mm_add_f32x4(float * A, int Row, int Col, float * B, float * C)
{
	const int size = Row * Col;
	int pos = 0;

	/* Vector part: runs while at least four elements remain. */
	while (size - pos >= 4)
	{
		const float32x4_t lhs = vld1q_f32(A + pos);
		const float32x4_t rhs = vld1q_f32(B + pos);
		vst1q_f32(C + pos, vaddq_f32(lhs, rhs));
		pos += 4;
	}

	/* Scalar part: the last size % 4 elements. */
	while (pos < size)
	{
		C[pos] = A[pos] + B[pos];
		pos++;
	}
}
开发者ID:Daniel-Hwang,项目名称:eeg,代码行数:23,


示例22: mw_neon_mv_mul_f32x4

/*
 * f32x4 mv mul: C = A * B for a column-major matrix and a vector.
 *
 *   A : Row x T matrix, element (i, k) at A[k * Row + i]
 *   B : vector of T elements
 *   C : vector of Row elements
 *
 * Row and T must be multiples of 4: the kernel works on 4x4 tiles and has no
 * tail handling.
 *
 * Fix relative to the previous version: the first column of each tile was
 * addressed with k * T + i while the following columns were reached by adding
 * Row, which is only consistent when Row == T. The base is now k * Row + i.
 */
void mw_neon_mv_mul_f32x4(float * A, int Row, int T, float * B, float * C)
{
	int i = 0;
	int k = 0;

	for (i = 0; i < Row; i+=4)
	{
		float32x4_t neon_c = vmovq_n_f32(0);

		for (k = 0; k < T; k+=4)
		{
			/* Rows i..i+3 of columns k..k+3 of A. */
			const float * a_col = A + k * Row + i;
			const float32x4_t neon_a0 = vld1q_f32(a_col);
			const float32x4_t neon_a1 = vld1q_f32(a_col + Row);
			const float32x4_t neon_a2 = vld1q_f32(a_col + 2 * Row);
			const float32x4_t neon_a3 = vld1q_f32(a_col + 3 * Row);

			/* Elements k..k+3 of B; each lane scales one column of A. */
			const float32x4_t neon_b = vld1q_f32(B + k);

			neon_c = vaddq_f32(vmulq_n_f32(neon_a0, vgetq_lane_f32(neon_b, 0)), neon_c);
			neon_c = vaddq_f32(vmulq_n_f32(neon_a1, vgetq_lane_f32(neon_b, 1)), neon_c);
			neon_c = vaddq_f32(vmulq_n_f32(neon_a2, vgetq_lane_f32(neon_b, 2)), neon_c);
			neon_c = vaddq_f32(vmulq_n_f32(neon_a3, vgetq_lane_f32(neon_b, 3)), neon_c);
		}

		vst1q_f32(C + i, neon_c);
	}
}
开发者ID:Daniel-Hwang,项目名称:eeg,代码行数:39,


示例23: neon_VecdotVec

/**
 * @brief   vector_dot_vector.
 *
 * @param   dst[out]     the output element (1*1)
 * @param   src1[in]     the input vector (1*n)
 *          src2[in]     the input vector (1*n)
 *          dimN[in]     size of the vectors; values <= 0 yield 0
 *
 * @return  void
 *
 * Fixes relative to the previous version:
 *  - The first four lanes were loaded unconditionally, so dimN < 4 read past
 *    the inputs and folded those extra products into the result. Short
 *    vectors are now handled entirely by the scalar loop.
 *  - Lane 0 of the reduced sum is read with vget_lane_f32 instead of through
 *    a pointer cast, and the inputs are no longer cast to non-const.
 */
void neon_VecdotVec(float *dst,
                    const float *src1,
                    const float *src2,
                    const int dimN)
{
    float32x4_t acc = vdupq_n_f32(0.0f);
    float32x2_t folded;
    float total;
    int j = 0;

    if (dimN >= 4)
    {
        /* First group seeds the accumulator; later groups are accumulated. */
        acc = vmulq_f32(vld1q_f32(src1), vld1q_f32(src2));

        for (j = 4; j <= dimN - 4; j += 4)
        {
            acc = vmlaq_f32(acc, vld1q_f32(src1 + j), vld1q_f32(src2 + j));
        }
    }

    /* Horizontal sum of the four lanes. */
    folded = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc));
    folded = vpadd_f32(folded, folded);
    total = vget_lane_f32(folded, 0);

    /* Elements not covered by a full group of four. */
    for (; j < dimN; j++)
    {
        total += src1[j] * src2[j];
    }

    *dst = total;
}
开发者ID:shenjiang11,项目名称:iOS,代码行数:34,


示例24: KERNEL_NAME

/*
 * Natural-logarithm kernel: y[i] = log(a[i]) for i in [0, n).
 * Full groups of four go through simd_ln4f(); the last n & 3 elements go
 * through logf(). The parameters b, z and other_params are not used here.
 */
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
    const unsigned int groups = n >> 2;       /* number of full 4-element groups */
    const unsigned int leftover = n & 3;      /* elements after the last group   */
    const unsigned int tail_base = n & (~3);  /* index of the first leftover     */
    unsigned int idx;

    for (idx = 0; idx < groups; idx++) {
        v4sf input = vld1q_f32(a + 4 * idx);
        v4sf logs = simd_ln4f(input);
        vst1q_f32(y + 4 * idx, logs);
    }

    for (idx = 0; idx < leftover; idx++) {
        y[idx + tail_base] = logf(a[idx + tail_base]);
    }
}
开发者ID:caomw,项目名称:OpenVML,代码行数:16,


示例25: Bias_arm::forward_inplace

// Adds bias_data[q] to every element of channel q of bottom_top_blob, in
// place. With __ARM_NEON, w*h elements per channel are processed four at a
// time and the remainder one by one; otherwise all of them one by one.
// Always returns 0.
int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    const float* bias_ptr = bias_data;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        float bias = bias_ptr[q];

#if __ARM_NEON
        int quads = size >> 2;
        int remain = size - (quads << 2);

        float32x4_t _bias = vdupq_n_f32(bias);
        while (quads > 0)
        {
            vst1q_f32(ptr, vaddq_f32(vld1q_f32(ptr), _bias));
            ptr += 4;
            quads--;
        }
#else
        int remain = size;
#endif // __ARM_NEON

        while (remain > 0)
        {
            *ptr = *ptr + bias;
            ptr++;
            remain--;
        }
    }

    return 0;
}
开发者ID:superhero1991,项目名称:ncnn,代码行数:44,


示例26: AudioBufferAddWithScale_NEON

// Accumulates a scaled copy of aInput into aOutput:
//   aOutput[k] += aInput[k] * aScale   for k in [0, aSize).
// Blocks of 16 floats are handled with four 4-lane multiply-accumulates;
// the remaining (aSize % 16) elements are handled one by one.
// Both pointers are passed through ASSERT_ALIGNED first.
void AudioBufferAddWithScale_NEON(const float* aInput,
                                  float aScale,
                                  float* aOutput,
                                  uint32_t aSize)
{
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aOutput);

  const float32x4_t scale = vmovq_n_f32(aScale);
  const uint32_t tailCount = aSize % 16;
  const uint32_t blockEnd = aSize - tailCount;

  unsigned i = 0;
  while (i < blockEnd) {
    float32x4_t in[4];
    float32x4_t acc[4];

    // All loads of the block happen before any store, as in the unrolled form.
    for (unsigned k = 0; k < 4; ++k) {
      in[k] = vld1q_f32(ADDRESS_OF(aInput, i + 4 * k));
    }
    for (unsigned k = 0; k < 4; ++k) {
      acc[k] = vld1q_f32(ADDRESS_OF(aOutput, i + 4 * k));
    }
    for (unsigned k = 0; k < 4; ++k) {
      acc[k] = vmlaq_f32(acc[k], in[k], scale);
    }
    for (unsigned k = 0; k < 4; ++k) {
      vst1q_f32(ADDRESS_OF(aOutput, i + 4 * k), acc[k]);
    }

    i += 16;
  }

  // Scalar tail: i continues from the end of the last full block.
  const unsigned tailEnd = i + tailCount;
  for (; i < tailEnd; ++i) {
    aOutput[i] += aInput[i]*aScale;
  }
}
开发者ID:Incognito,项目名称:mozilla-central,代码行数:41,


示例27: test_square_root_v4sf

/* Checks vsqrtq_f32 on four perfect squares: every lane of the result must
   compare equal to the exact root, otherwise the test calls abort ().  */
void
test_square_root_v4sf ()
{
  const float32_t squares[] = {4.0f, 9.0f, 16.0f, 25.0f};
  float32x4_t roots;

  roots = vsqrtq_f32 (vld1q_f32 (squares));

  /* Lanes are examined in order 0..3; the first mismatch aborts.  */
  if (vgetq_lane_f32 (roots, 0) != 2.0f
      || vgetq_lane_f32 (roots, 1) != 3.0f
      || vgetq_lane_f32 (roots, 2) != 4.0f
      || vgetq_lane_f32 (roots, 3) != 5.0f)
    abort ();
}
开发者ID:0day-ci,项目名称:gcc,代码行数:19,


示例28: main

/* Exercises vmuls_laneq_f32: multiplies the scalar pi by each lane of a
   4-lane vector and compares against the same product computed in plain C.
   The scalar operands are volatile, so each use is a real read rather than
   a compile-time constant.  Any mismatch calls abort (); success returns 0.  */
int
main (void)
{
  volatile float32_t minus_e, pi, ln2, sqrt2, phi;
  float32_t expected;
  float32_t actual;
  float32_t lane_values[4];
  float32x4_t lanes;

  pi = 3.14159265359;

  /* Each constant is stored both in its volatile scalar and in the array
     that becomes the vector operand.  */
  lane_values[0] = minus_e = -2.71828;
  lane_values[1] = ln2 = 0.69314718056;
  lane_values[2] = sqrt2 = 1.41421356237;
  lane_values[3] = phi = 1.61803398874;

  lanes = vld1q_f32 (lane_values);

  /* Lane 0.  */
  actual = vmuls_laneq_f32 (pi, lanes, 0);
  expected = pi * minus_e;
  if (expected != actual)
    {
      abort ();
    }

  /* Lane 1.  */
  expected = pi * ln2;
  actual = vmuls_laneq_f32 (pi, lanes, 1);
  if (expected != actual)
    {
      abort ();
    }

  /* Lane 2.  */
  expected = pi * sqrt2;
  actual = vmuls_laneq_f32 (pi, lanes, 2);
  if (expected != actual)
    {
      abort ();
    }

  /* Lane 3.  */
  expected = pi * phi;
  actual = vmuls_laneq_f32 (pi, lanes, 3);
  if (expected != actual)
    {
      abort ();
    }

  return 0;
}
开发者ID:0day-ci,项目名称:gcc,代码行数:41,


示例29: neon_axpby

/**
 * @brief   Scaled accumulate: A[i] = alpha * B[i] + beta * A[i].
 *
 * Blocks of 16 elements are processed with four 4-lane NEON operations;
 * the remaining elements are processed one at a time.
 *
 * @param   dst[in,out]  the accumulating array A (read and overwritten).
 *          src[in]      the input array B.
 *          alpha[in]    scale applied to B.
 *          beta[in]     scale applied to A.
 *          elemCnt[in]  number of elements to process.
 *
 * @return  void.
 */
void neon_axpby(float *dst,
                const float *src,
                const float alpha,
                const float beta,
                const int elemCnt)
{
    const int lastBlockStart = elemCnt - 16;
    int i = 0;
    int k;

    while (i <= lastBlockStart)
    {
        float32x4_t b[4];
        float32x4_t a[4];

        /* Load the whole 16-element block before writing anything back. */
        for (k = 0; k < 4; k++)
        {
            b[k] = vld1q_f32(src + i + 4 * k);
        }
        for (k = 0; k < 4; k++)
        {
            a[k] = vld1q_f32(dst + i + 4 * k);
        }

        /* alpha * B first, then accumulate beta * A on top of it. */
        for (k = 0; k < 4; k++)
        {
            b[k] = vmulq_n_f32(b[k], alpha);
        }
        for (k = 0; k < 4; k++)
        {
            b[k] = vmlaq_n_f32(b[k], a[k], beta);
        }

        for (k = 0; k < 4; k++)
        {
            vst1q_f32(dst + i + 4 * k, b[k]);
        }

        i += 16;
    }

    /* Scalar tail for the last (elemCnt % 16) elements. */
    while (i < elemCnt)
    {
        dst[i] = src[i] * alpha + dst[i] * beta;
        i++;
    }
}
开发者ID:shenjiang11,项目名称:iOS,代码行数:47,


示例30: FilterFarNEON

// Accumulates, for every partition, the complex product of the x_fft_buf
// block and the h_fft_buf block into y_fft. Index [0] holds the real parts
// and index [1] the imaginary parts. The x block index is offset by
// x_fft_buf_block_pos and wraps around at num_partitions.
static void FilterFarNEON(
    int num_partitions,
    int x_fft_buf_block_pos,
    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float y_fft[2][PART_LEN1]) {
  int part;
  for (part = 0; part < num_partitions; part++) {
    const int block = part + x_fft_buf_block_pos;
    const int h_off = part * PART_LEN1;
    int x_off = block * PART_LEN1;
    const float* x_re;
    const float* x_im;
    const float* h_re;
    const float* h_im;
    int j;

    // Wrap the x block position back into range.
    if (block >= num_partitions) {
      x_off -= num_partitions * PART_LEN1;
    }

    x_re = &x_fft_buf[0][x_off];
    x_im = &x_fft_buf[1][x_off];
    h_re = &h_fft_buf[0][h_off];
    h_im = &h_fft_buf[1][h_off];

    // Four bins per iteration.
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t xr = vld1q_f32(x_re + j);
      const float32x4_t xi = vld1q_f32(x_im + j);
      const float32x4_t hr = vld1q_f32(h_re + j);
      const float32x4_t hi = vld1q_f32(h_im + j);
      const float32x4_t yr = vld1q_f32(&y_fft[0][j]);
      const float32x4_t yi = vld1q_f32(&y_fft[1][j]);
      // Complex multiply: re = xr*hr - xi*hi, im = xr*hi + xi*hr.
      const float32x4_t prod_re = vmlsq_f32(vmulq_f32(xr, hr), xi, hi);
      const float32x4_t prod_im = vmlaq_f32(vmulq_f32(xr, hi), xi, hr);
      vst1q_f32(&y_fft[0][j], vaddq_f32(yr, prod_re));
      vst1q_f32(&y_fft[1][j], vaddq_f32(yi, prod_im));
    }

    // Remaining bins go through the scalar MulRe/MulIm helpers.
    for (; j < PART_LEN1; j++) {
      y_fft[0][j] += MulRe(x_re[j], x_im[j], h_re[j], h_im[j]);
      y_fft[1][j] += MulIm(x_re[j], x_im[j], h_re[j], h_im[j]);
    }
  }
}
开发者ID:Crawping,项目名称:chromium_extract,代码行数:46,



注:本文中的vld1q_f32函数示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


C++ vld1q_u8函数代码示例
C++ vld1_u8函数代码示例
万事OK自学网:51自学网_软件自学网_CAD自学网,提供自学excel、自学PS、自学CAD、自学C语言、自学css3实例等教程,是一个通过网络自主学习工作技能的自学平台,也是网友喜欢的软件自学网站。