您当前的位置:首页 > IT编程 > C++
| C语言 | Java | VB | VC | python | Android | TensorFlow | C++ | oracle | 学术与代码 | cnn卷积神经网络 | gnn | 图像修复 | Keras | 数据集 | Neo4j | 自然语言处理 | 深度学习 | 医学CAD | 医学影像 | 超参数 | pointnet | pytorch | 异常检测 | Transformers | 情感分类 | 知识图谱 |

自学教程:C++ vld1_u8函数代码示例

51自学网 2021-06-03 09:44:54
  C++
这篇教程《C++ vld1_u8函数代码示例》写得很实用,希望能帮到您。

本文整理汇总了C++中vld1_u8函数的典型用法代码示例。vld1_u8是ARM NEON内建函数(声明于arm_neon.h),用于从内存中一次读取8个uint8_t并返回uint8x8_t向量。如果您正苦于以下问题:C++ vld1_u8函数的具体用法?C++ vld1_u8怎么用?C++ vld1_u8使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。

在下文中一共展示了vld1_u8函数的30个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。

示例1: variance_neon_w8

static void variance_neon_w8(const uint8_t *a, int a_stride,                             const uint8_t *b, int b_stride,                             int w, int h, unsigned int *sse, int *sum) {  int i, j;  int16x8_t v_sum = vdupq_n_s16(0);  int32x4_t v_sse_lo = vdupq_n_s32(0);  int32x4_t v_sse_hi = vdupq_n_s32(0);  for (i = 0; i < h; ++i) {    for (j = 0; j < w; j += 8) {      const uint8x8_t v_a = vld1_u8(&a[j]);      const uint8x8_t v_b = vld1_u8(&b[j]);      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);      v_sum = vaddq_s16(v_sum, sv_diff);      v_sse_lo = vmlal_s16(v_sse_lo,                           vget_low_s16(sv_diff),                           vget_low_s16(sv_diff));      v_sse_hi = vmlal_s16(v_sse_hi,                           vget_high_s16(sv_diff),                           vget_high_s16(sv_diff));    }    a += a_stride;    b += b_stride;  }  *sum = horizontal_add_s16x8(v_sum);  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));}
开发者ID:Andrel322,项目名称:gecko-dev,代码行数:29,


示例2: ConvertARGBToUV_NEON

static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,                                 int src_width, int do_store) {  int i;  for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {    const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);    const uint16x8_t R = vpaddlq_u8(RGB.val[2]);  // pair-wise adds    const uint16x8_t G = vpaddlq_u8(RGB.val[1]);    const uint16x8_t B = vpaddlq_u8(RGB.val[0]);    int16x8_t U_tmp, V_tmp;    CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);    {      const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);      const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);      if (do_store) {        vst1_u8(u, U);        vst1_u8(v, V);      } else {        const uint8x8_t prev_u = vld1_u8(u);        const uint8x8_t prev_v = vld1_u8(v);        vst1_u8(u, vrhadd_u8(U, prev_u));        vst1_u8(v, vrhadd_u8(V, prev_v));      }    }  }  if (i < src_width) {  // left-over    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);  }}
开发者ID:1vanK,项目名称:Urho3D,代码行数:28,


示例3: main

int main(void){    uint8_t v1_init[8] = {1, 1, 1, 1, 1, 1, 1, 1};    uint8_t v2_init[8] = {2, 2, 2, 2, 2, 2, 2, 2};    uint8x8_t v1 = vld1_u8 (v1_init);    uint8x8_t v2 = vld1_u8 (v2_init);    uint8x8x2_t vd1, vd2;    union {uint8x8_t v; uint8_t buf[8];} d1, d2, d3, d4;    int i;    uint8_t odd, even;    vd1 = vzip_u8(v1, vdup_n_u8(0));    vd2 = vzip_u8(v2, vdup_n_u8(0));    vst1_u8(d1.buf, vd1.val[0]);    vst1_u8(d2.buf, vd1.val[1]);    vst1_u8(d3.buf, vd2.val[0]);    vst1_u8(d4.buf, vd2.val[1]);#ifdef __ARMEL__    odd = 1;    even = 0;#else    odd = 0;    even = 1;#endif    for (i = 0; i < 8; i++)      if ((i % 2 == even && d4.buf[i] != 2)          || (i % 2 == odd && d4.buf[i] != 0))         abort ();    return 0;}
开发者ID:ChaosJohn,项目名称:gcc,代码行数:34,


示例4: byte2float48_neon

/*
 * Converts 48 unsigned 8-bit samples to 32-bit floats.
 *
 * The samples are four runs of 12 bytes, starting at t, t + 2*pitch,
 * t + 4*pitch and t + 6*pitch.  The converted values are written to
 * p[0..47] in that order.
 *
 * t     : first source byte
 * pitch : byte distance between consecutive source rows
 * p     : destination for 48 floats
 */
void byte2float48_neon(const uint8_t *t, const int pitch, float *p) {
    uint16x8_t wide[6];
    /* vld1_lane_u32 takes the vector as an input operand, so it must hold a
     * defined value even though both lanes are overwritten below. */
    uint32x2_t mid_a = vdup_n_u32(0);
    uint32x2_t mid_b = vdup_n_u32(0);
    int k;

    /* First two runs: 8 bytes, then 4 + 4 bytes bridging both runs, then 8. */
    wide[0] = vmovl_u8(vld1_u8(t));
    mid_a = vld1_lane_u32((const uint32_t *)(t + 8), mid_a, 0);
    mid_a = vld1_lane_u32((const uint32_t *)(t + pitch * 2), mid_a, 1);
    wide[1] = vmovl_u8(vreinterpret_u8_u32(mid_a));
    wide[2] = vmovl_u8(vld1_u8(t + pitch * 2 + 4));

    /* Last two runs, same layout four rows further down. */
    t += pitch * 4;
    wide[3] = vmovl_u8(vld1_u8(t));
    mid_b = vld1_lane_u32((const uint32_t *)(t + 8), mid_b, 0);
    mid_b = vld1_lane_u32((const uint32_t *)(t + pitch * 2), mid_b, 1);
    wide[4] = vmovl_u8(vreinterpret_u8_u32(mid_b));
    wide[5] = vmovl_u8(vld1_u8(t + pitch * 2 + 4));

    /* Widen each group of eight 16-bit values to 32 bits and convert. */
    for (k = 0; k < 6; ++k) {
        vst1q_f32(p + 8 * k,
                  vcvtq_f32_u32(vmovl_u16(vget_low_u16(wide[k]))));
        vst1q_f32(p + 8 * k + 4,
                  vcvtq_f32_u32(vmovl_u16(vget_high_u16(wide[k]))));
    }
}
开发者ID:IFeelBloated,项目名称:vapoursynth-nnedi3,代码行数:31,


示例5: ConvertBGRAToRGBA

/*
 * Reorders the bytes of 32-bit pixels using the kRGBAShuffle table, two
 * pixels (8 bytes) per iteration, writing the result to dst.
 *
 * src        : source pixels
 * num_pixels : number of pixels to convert
 * dst        : destination; 4 bytes are written per pixel
 *
 * An odd trailing pixel is handed to VP8LConvertBGRAToRGBA_C.
 */
static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~1);
  const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
  for (; src < end; src += 2) {
    /* Keep the source const-qualified when viewing it as bytes. */
    const uint8x8_t pixels = vld1_u8((const uint8_t*)src);
    vst1_u8(dst, vtbl1_u8(pixels, shuffle));
    dst += 8;
  }
  VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst);  /* left-overs */
}
开发者ID:0109yuma,项目名称:drrrbile-reader,代码行数:11,


示例6: vp9_avg_8x8_neon

/*
 * Returns the rounded average of an 8x8 block of 8-bit samples:
 * (sum of the 64 samples + 32) >> 6.
 *
 * s : top-left sample of the block
 * p : distance between consecutive rows
 */
unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
  /* Rows 0 and 1 seed the eight 16-bit column accumulators. */
  uint16x8_t col_sums = vaddl_u8(vld1_u8(s), vld1_u8(s + p));
  int row;

  for (row = 2; row < 8; ++row) {
    col_sums = vaddw_u8(col_sums, vld1_u8(s + row * p));
  }

  return (horizontal_add_u16x8(col_sums) + 32) >> 6;
}
开发者ID:d4rkuzs,项目名称:ffmpeg-codecs-libs,代码行数:25,


示例7: read_4x8

static INLINEuint8x8x4_t read_4x8(unsigned char *src, int pitch) {    uint8x8x4_t x;    const uint8x8_t a = vld1_u8(src);    const uint8x8_t b = vld1_u8(src + pitch * 1);    const uint8x8_t c = vld1_u8(src + pitch * 2);    const uint8x8_t d = vld1_u8(src + pitch * 3);    const uint8x8_t e = vld1_u8(src + pitch * 4);    const uint8x8_t f = vld1_u8(src + pitch * 5);    const uint8x8_t g = vld1_u8(src + pitch * 6);    const uint8x8_t h = vld1_u8(src + pitch * 7);    const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),                                          vreinterpret_u32_u8(e));    const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),                                          vreinterpret_u32_u8(f));    const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),                                          vreinterpret_u32_u8(g));    const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),                                          vreinterpret_u32_u8(h));    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),                                          vreinterpret_u16_u32(r26_u32.val[0]));    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),                                          vreinterpret_u16_u32(r37_u32.val[0]));    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),                                       vreinterpret_u8_u16(r13_u16.val[0]));    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),                                       vreinterpret_u8_u16(r13_u16.val[1]));    /*     * after vtrn_u32    00 01 02 03 | 40 41 42 43    10 11 12 13 | 50 51 52 53    20 21 22 23 | 60 61 62 63    30 31 32 33 | 70 71 72 73    ---    * after vtrn_u16    00 01 20 21 | 40 41 60 61    02 03 22 23 | 42 43 62 63    10 11 30 31 | 50 51 70 71    12 13 32 33 | 52 52 72 73    00 01 20 21 | 40 41 60 61    10 11 30 31 | 50 51 70 71    02 03 22 23 | 42 43 62 63    12 13 32 33 | 
52 52 72 73    ---    * after vtrn_u8    00 10 20 30 | 40 50 60 70    01 11 21 31 | 41 51 61 71    02 12 22 32 | 42 52 62 72    03 13 23 33 | 43 53 63 73    */    x.val[0] = r01_u8.val[0];    x.val[1] = r01_u8.val[1];    x.val[2] = r23_u8.val[0];    x.val[3] = r23_u8.val[1];    return x;}
开发者ID:93i,项目名称:godot,代码行数:58,


示例8: char_to_float_vectors

/*
 * Loads eight source bytes, subtracts 128 from the odd-indexed ones, and
 * returns all eight values as floats in two four-lane vectors.
 *
 * sourcep : eight source bytes, laid out as {[YUYV]0 [YUYV]1}
 * mp0     : receives values 0..3 as floats
 * mp1     : receives values 4..7 as floats
 */
static inline void char_to_float_vectors(const unsigned char * sourcep,
			   float32x4_t *mp0, float32x4_t * mp1)
{
 /* Bias applied to the U and V positions only. */
 const int16x8_t uvbias = {0, 128, 0, 128, 0, 128, 0, 128};

 /* Promote the bytes to signed shorts and remove the bias. */
 const int16x8_t centered =
     vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8(sourcep))), uvbias);

 /* Split into halves, widen each half to 32 bits, convert to float. */
 const int32x4_t first4 = vmovl_s16(vget_low_s16(centered));
 const int32x4_t last4 = vmovl_s16(vget_high_s16(centered));

 *mp0 = vcvtq_f32_s32(first4);
 *mp1 = vcvtq_f32_s32(last4);
}
开发者ID:yaojingguo,项目名称:gcc-intrinsics-samplecode,代码行数:29,


示例9: vpx_d135_predictor_4x4_neon

void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,                                 const uint8_t *above, const uint8_t *left) {  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);  const uint32x2_t zero = vdup_n_u32(0);  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));  const uint8_t D = vget_lane_u8(XABCD_u8, 4);  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);}
开发者ID:jmvalin,项目名称:aom,代码行数:27,


示例10: vpx_v_predictor_8x8_neon

void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,                              const uint8_t *above, const uint8_t *left) {  int i;  uint8x8_t d0u8 = vdup_n_u8(0);  (void)left;  d0u8 = vld1_u8(above);  for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);}
开发者ID:jmvalin,项目名称:aom,代码行数:9,


示例11: vpx_d45_predictor_8x8_neon

void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,                                const uint8_t *above, const uint8_t *left) {  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };  const uint8x8_t sh_12345677 = vld1_u8(shuffle1);  const uint8x8_t sh_23456777 = vld1_u8(shuffle2);  const uint8x8_t A0 = vld1_u8(above);  // top row  const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);  const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);  const uint8x8_t avg1 = vhadd_u8(A0, A2);  uint8x8_t row = vrhadd_u8(avg1, A1);  int i;  (void)left;  for (i = 0; i < 7; ++i) {    vst1_u8(dst + i * stride, row);    row = vtbl1_u8(row, sh_12345677);  }  vst1_u8(dst + i * stride, row);}
开发者ID:jmvalin,项目名称:aom,代码行数:19,


示例12: byte2word64_neon

/*
 * Widens 64 unsigned 8-bit samples to 16 bits.
 *
 * The samples are four runs of 16 bytes, starting at t, t + 2*pitch,
 * t + 4*pitch and t + 6*pitch.  The destination is addressed as an array of
 * uint16_t and receives 64 values in that order.
 *
 * t     : first source byte
 * pitch : byte distance between consecutive source rows
 * pf    : destination buffer, written as 64 uint16_t values
 */
void byte2word64_neon(const uint8_t *t, const int pitch, float *pf) {
    uint16_t *out = (uint16_t *)pf;
    int k;

    for (k = 0; k < 4; ++k) {
        const uint8_t *run = t + pitch * 2 * k;
        vst1q_u16(out + 16 * k, vmovl_u8(vld1_u8(run)));
        vst1q_u16(out + 16 * k + 8, vmovl_u8(vld1_u8(run + 8)));
    }
}
开发者ID:IFeelBloated,项目名称:vapoursynth-nnedi3,代码行数:12,


示例13: vpx_get4x4sse_cs_neon

unsigned int vpx_get4x4sse_cs_neon(        const unsigned char *src_ptr,        int source_stride,        const unsigned char *ref_ptr,        int recon_stride) {    int16x4_t d22s16, d24s16, d26s16, d28s16;    int64x1_t d0s64;    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;    int32x4_t q7s32, q8s32, q9s32, q10s32;    uint16x8_t q11u16, q12u16, q13u16, q14u16;    int64x2_t q1s64;    d0u8 = vld1_u8(src_ptr);    src_ptr += source_stride;    d4u8 = vld1_u8(ref_ptr);    ref_ptr += recon_stride;    d1u8 = vld1_u8(src_ptr);    src_ptr += source_stride;    d5u8 = vld1_u8(ref_ptr);    ref_ptr += recon_stride;    d2u8 = vld1_u8(src_ptr);    src_ptr += source_stride;    d6u8 = vld1_u8(ref_ptr);    ref_ptr += recon_stride;    d3u8 = vld1_u8(src_ptr);    src_ptr += source_stride;    d7u8 = vld1_u8(ref_ptr);    ref_ptr += recon_stride;    q11u16 = vsubl_u8(d0u8, d4u8);    q12u16 = vsubl_u8(d1u8, d5u8);    q13u16 = vsubl_u8(d2u8, d6u8);    q14u16 = vsubl_u8(d3u8, d7u8);    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));    q7s32 = vmull_s16(d22s16, d22s16);    q8s32 = vmull_s16(d24s16, d24s16);    q9s32 = vmull_s16(d26s16, d26s16);    q10s32 = vmull_s16(d28s16, d28s16);    q7s32 = vaddq_s32(q7s32, q8s32);    q9s32 = vaddq_s32(q9s32, q10s32);    q9s32 = vaddq_s32(q7s32, q9s32);    q1s64 = vpaddlq_s32(q9s32);    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);}
开发者ID:lianhaidong,项目名称:libvpx,代码行数:53,


示例14: dc_8x8

// 'do_above' and 'do_left' facilitate branch removal when inlined.static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,                          const uint8_t *above, const uint8_t *left,                          int do_above, int do_left) {  uint16x8_t sum_top;  uint16x8_t sum_left;  uint8x8_t dc0;  if (do_above) {    const uint8x8_t A = vld1_u8(above);  // top row    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top    const uint16x4_t p1 = vpadd_u16(p0, p0);    const uint16x4_t p2 = vpadd_u16(p1, p1);    sum_top = vcombine_u16(p2, p2);  }  if (do_left) {    const uint8x8_t L = vld1_u8(left);  // left border    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left    const uint16x4_t p1 = vpadd_u16(p0, p0);    const uint16x4_t p2 = vpadd_u16(p1, p1);    sum_left = vcombine_u16(p2, p2);  }  if (do_above && do_left) {    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);    dc0 = vrshrn_n_u16(sum, 4);  } else if (do_above) {    dc0 = vrshrn_n_u16(sum_top, 3);  } else if (do_left) {    dc0 = vrshrn_n_u16(sum_left, 3);  } else {    dc0 = vdup_n_u8(0x80);  }  {    const uint8x8_t dc = vdup_lane_u8(dc0, 0);    int i;    for (i = 0; i < 8; ++i) {      vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));    }  }}
开发者ID:MekliCZ,项目名称:positron,代码行数:43,


示例15: var_filter_block2d_bil_w8

// Process a block exactly 8 wide and any height.static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,                                      uint8_t *output_ptr,                                      unsigned int src_pixels_per_line,                                      int pixel_step,                                      unsigned int output_height,                                      const uint8_t *filter) {  const uint8x8_t f0 = vdup_n_u8(filter[0]);  const uint8x8_t f1 = vdup_n_u8(filter[1]);  unsigned int i;  for (i = 0; i < output_height; ++i) {    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);    const uint16x8_t a = vmull_u8(src_0, f0);    const uint16x8_t b = vmlal_u8(a, src_1, f1);    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);    vst1_u8(output_ptr, out);    src_ptr += src_pixels_per_line;    output_ptr += 8;  }}
开发者ID:webmproject,项目名称:libvpx,代码行数:21,


示例16: vp9_tm_predictor_8x8_neon

void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,                               const uint8_t *above, const uint8_t *left) {  int j;  uint16x8_t q0u16, q3u16, q10u16;  int16x8_t q0s16;  uint16x4_t d20u16;  uint8x8_t d0u8, d2u8, d30u8;  d0u8 = vld1_dup_u8(above - 1);  d30u8 = vld1_u8(left);  d2u8 = vld1_u8(above);  q10u16 = vmovl_u8(d30u8);  q3u16 = vsubl_u8(d2u8, d0u8);  d20u16 = vget_low_u16(q10u16);  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {    q0u16 = vdupq_lane_u16(d20u16, 0);    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),                      vreinterpretq_s16_u16(q0u16));    d0u8 = vqmovun_s16(q0s16);    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));    dst += stride;    q0u16 = vdupq_lane_u16(d20u16, 1);    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),                      vreinterpretq_s16_u16(q0u16));    d0u8 = vqmovun_s16(q0s16);    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));    dst += stride;    q0u16 = vdupq_lane_u16(d20u16, 2);    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),                      vreinterpretq_s16_u16(q0u16));    d0u8 = vqmovun_s16(q0s16);    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));    dst += stride;    q0u16 = vdupq_lane_u16(d20u16, 3);    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),                      vreinterpretq_s16_u16(q0u16));    d0u8 = vqmovun_s16(q0s16);    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));    dst += stride;  }}
开发者ID:MekliCZ,项目名称:positron,代码行数:41,


示例17: vp8_copy_mem8x8_neon

/*
 * Copies an 8x8 block of bytes.
 *
 * src, src_stride : source block and the distance between its rows
 * dst, dst_stride : destination block and the distance between its rows
 */
void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride,
                          unsigned char *dst, int dst_stride) {
  int rows_left = 8;

  while (rows_left-- > 0) {
    vst1_u8(dst, vld1_u8(src));
    src += src_stride;
    dst += dst_stride;
  }
}
开发者ID:webmproject,项目名称:libvpx,代码行数:12,


示例18: var_filter_block2d_bil_w8

static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,                                      uint8_t *output_ptr,                                      unsigned int src_pixels_per_line,                                      int pixel_step,                                      unsigned int output_height,                                      unsigned int output_width,                                      const uint16_t *vpx_filter) {  const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]);  const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]);  unsigned int i;  for (i = 0; i < output_height; ++i) {    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);    const uint16x8_t a = vmull_u8(src_0, f0);    const uint16x8_t b = vmlal_u8(a, src_1, f1);    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);    vst1_u8(&output_ptr[0], out);    // Next row...    src_ptr += src_pixels_per_line;    output_ptr += output_width;  }}
开发者ID:Andrel322,项目名称:gecko-dev,代码行数:22,


示例19: AddGreenToBlueAndRed

/*
 * In place, adds to every byte of each 32-bit pixel the byte selected for
 * it by the kGreenShuffle table, four pixels (16 bytes) per iteration.
 * The addition wraps modulo 256.
 *
 * argb_data  : pixels to update
 * num_pixels : number of pixels
 *
 * The final (num_pixels & 3) pixels are handled by
 * VP8LAddGreenToBlueAndRed_C.
 */
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);

  while (argb_data < end) {
    uint8_t* const bytes = (uint8_t*)argb_data;
    const uint8x16_t px = vld1q_u8(bytes);
    /* The table lookup works on 8-byte halves, so shuffle each separately. */
    const uint8x8_t add_lo = vtbl1_u8(vget_low_u8(px), shuffle);
    const uint8x8_t add_hi = vtbl1_u8(vget_high_u8(px), shuffle);

    vst1q_u8(bytes, vaddq_u8(px, vcombine_u8(add_lo, add_hi)));
    argb_data += 4;
  }

  /* Finish off the remainder with plain C. */
  VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
}
开发者ID:0109yuma,项目名称:drrrbile-reader,代码行数:13,


示例20: byte2word48_neon

void byte2word48_neon(const uint8_t *t, const int pitch, float *pf) {    uint16_t *p = (uint16_t *)pf;    uint8x8_t m0, m1, m2, m3, m4, m5;    m0 = vld1_u8(t);    m1 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + 8), vreinterpret_u32_u8(m1), 0));    m1 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + pitch * 2), vreinterpret_u32_u8(m1), 1));    m2 = vld1_u8(t + pitch * 2 + 4);    t += pitch * 4;    m3 = vld1_u8(t);    m4 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + 8), vreinterpret_u32_u8(m4), 0));    m4 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + pitch * 2), vreinterpret_u32_u8(m4), 1));    m5 = vld1_u8(t + pitch * 2 + 4);    vst1q_u16(p, vmovl_u8(m0));    vst1q_u16(p + 8, vmovl_u8(m1));    vst1q_u16(p + 16, vmovl_u8(m2));    vst1q_u16(p + 24, vmovl_u8(m3));    vst1q_u16(p + 32, vmovl_u8(m4));    vst1q_u16(p + 40, vmovl_u8(m5));}
开发者ID:IFeelBloated,项目名称:vapoursynth-nnedi3,代码行数:24,


示例21: SubtractGreenFromBlueAndRed

static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {  const uint32_t* const end = argb_data + (num_pixels & ~3);#ifdef USE_VTBLQ  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);#else  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);#endif  for (; argb_data < end; argb_data += 4) {    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);    vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));  }  // fallthrough and finish off with plain-C  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);}
开发者ID:BBLionel,项目名称:libwebp,代码行数:15,


示例22: test_vdupb_lane_u8

/*
 * Checks that the wrap_vdupb_lane_u8_0 / wrap_vdupb_lane_u8_1 wrappers
 * return elements 0 and 1 of a vector loaded from a known array.
 *
 * Returns 0 when both values match, 1 on the first mismatch.
 */
int
test_vdupb_lane_u8 (void)
{
  uint8x8_t a;
  uint8_t b;
  uint8_t c[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

  a = vld1_u8 (c);

  /* The lane-0 wrapper is called with two vector arguments. */
  b = wrap_vdupb_lane_u8_0 (a, a);
  if (c[0] != b)
    return 1;

  b = wrap_vdupb_lane_u8_1 (a);
  if (c[1] != b)
    return 1;

  return 0;
}
开发者ID:0day-ci,项目名称:gcc,代码行数:15,


示例23: vpx_lpf_horizontal_4_dual_neon

void vpx_lpf_horizontal_4_dual_neon(    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,    const uint8_t *limit1, const uint8_t *thresh1) {  uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;  uint8x16_t qblimit, qlimit, qthresh;  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;  dblimit0 = vld1_u8(blimit0);  dlimit0 = vld1_u8(limit0);  dthresh0 = vld1_u8(thresh0);  dblimit1 = vld1_u8(blimit1);  dlimit1 = vld1_u8(limit1);  dthresh1 = vld1_u8(thresh1);  qblimit = vcombine_u8(dblimit0, dblimit1);  qlimit = vcombine_u8(dlimit0, dlimit1);  qthresh = vcombine_u8(dthresh0, dthresh1);  s -= (p << 2);  q3u8 = vld1q_u8(s);  s += p;  q4u8 = vld1q_u8(s);  s += p;  q5u8 = vld1q_u8(s);  s += p;  q6u8 = vld1q_u8(s);  s += p;  q7u8 = vld1q_u8(s);  s += p;  q8u8 = vld1q_u8(s);  s += p;  q9u8 = vld1q_u8(s);  s += p;  q10u8 = vld1q_u8(s);  loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,                      q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);  s -= (p * 5);  vst1q_u8(s, q5u8);  s += p;  vst1q_u8(s, q6u8);  s += p;  vst1q_u8(s, q7u8);  s += p;  vst1q_u8(s, q8u8);  return;}
开发者ID:Corax26,项目名称:libvpx,代码行数:49,


示例24: ConvertBGRAToBGR

/*
 * Packs 32-bit pixels into 3 bytes each using the three kBGRShuffle
 * tables, eight pixels (32 bytes in, 24 bytes out) per iteration.
 *
 * src        : source pixels
 * num_pixels : number of pixels to convert
 * dst        : destination; 3 bytes are written per pixel
 *
 * The final (num_pixels & 7) pixels are handed to VP8LConvertBGRAToBGR_C.
 */
static void ConvertBGRAToBGR(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~7);
  const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
  const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
  const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);

  while (src < end) {
    const uint8_t* const bytes = (const uint8_t*)src;
    uint8x8x4_t pixels;
    /* Four 8-byte vectors form the 32-byte lookup table for vtbl4. */
    INIT_VECTOR4(pixels,
                 vld1_u8(bytes + 0),
                 vld1_u8(bytes + 8),
                 vld1_u8(bytes + 16),
                 vld1_u8(bytes + 24));
    vst1_u8(dst +  0, vtbl4_u8(pixels, shuffle0));
    vst1_u8(dst +  8, vtbl4_u8(pixels, shuffle1));
    vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
    dst += 8 * 3;
    src += 8;
  }

  VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst);  /* left-overs */
}
开发者ID:0109yuma,项目名称:drrrbile-reader,代码行数:20,


示例25: vpx_d45_predictor_4x4_neon

void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,                                const uint8_t *above, const uint8_t *left) {  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row  const uint64x1_t A1 = vshr_n_u64(A0, 8);  const uint64x1_t A2 = vshr_n_u64(A0, 16);  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));  (void)left;  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);  dst[3 * stride + 3] = above[7];}
开发者ID:jmvalin,项目名称:aom,代码行数:22,


示例26: aom_lpf_horizontal_8_neon

void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,                               const uint8_t *limit, const uint8_t *thresh) {  int i;  uint8_t *s, *psrc;  uint8x8_t dblimit, dlimit, dthresh;  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;  uint8x8_t d16u8, d17u8, d18u8;  dblimit = vld1_u8(blimit);  dlimit = vld1_u8(limit);  dthresh = vld1_u8(thresh);  psrc = src - (pitch << 2);  for (i = 0; i < 1; i++) {    s = psrc + i * 8;    d3u8 = vld1_u8(s);    s += pitch;    d4u8 = vld1_u8(s);    s += pitch;    d5u8 = vld1_u8(s);    s += pitch;    d6u8 = vld1_u8(s);    s += pitch;    d7u8 = vld1_u8(s);    s += pitch;    d16u8 = vld1_u8(s);    s += pitch;    d17u8 = vld1_u8(s);    s += pitch;    d18u8 = vld1_u8(s);    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,                       &d5u8);    s -= (pitch * 6);    vst1_u8(s, d0u8);    s += pitch;    vst1_u8(s, d1u8);    s += pitch;    vst1_u8(s, d2u8);    s += pitch;    vst1_u8(s, d3u8);    s += pitch;    vst1_u8(s, d4u8);    s += pitch;    vst1_u8(s, d5u8);  }  return;}
开发者ID:Wafflespeanut,项目名称:gecko-dev,代码行数:51,


示例27: aom_lpf_vertical_8_neon

void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,                             const uint8_t *limit, const uint8_t *thresh) {  int i;  uint8_t *s;  uint8x8_t dblimit, dlimit, dthresh;  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;  uint8x8_t d16u8, d17u8, d18u8;  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;  uint8x8x4_t d4Result;  uint8x8x2_t d2Result;  dblimit = vld1_u8(blimit);  dlimit = vld1_u8(limit);  dthresh = vld1_u8(thresh);  for (i = 0; i < 1; i++) {    s = src + (i * (pitch << 3)) - 4;    d3u8 = vld1_u8(s);    s += pitch;    d4u8 = vld1_u8(s);    s += pitch;    d5u8 = vld1_u8(s);    s += pitch;    d6u8 = vld1_u8(s);    s += pitch;    d7u8 = vld1_u8(s);    s += pitch;    d16u8 = vld1_u8(s);    s += pitch;    d17u8 = vld1_u8(s);    s += pitch;    d18u8 = vld1_u8(s);    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),                      vreinterpret_u16_u32(d2tmp2.val[0]));    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),                      vreinterpret_u16_u32(d2tmp3.val[0]));    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),                      vreinterpret_u16_u32(d2tmp2.val[1]));    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),                      vreinterpret_u16_u32(d2tmp3.val[1]));    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),                     vreinterpret_u8_u16(d2tmp5.val[0]));    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),                     vreinterpret_u8_u16(d2tmp5.val[1]));    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),                      
vreinterpret_u8_u16(d2tmp7.val[0]));    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),                      vreinterpret_u8_u16(d2tmp7.val[1]));    d3u8 = d2tmp8.val[0];    d4u8 = d2tmp8.val[1];    d5u8 = d2tmp9.val[0];    d6u8 = d2tmp9.val[1];    d7u8 = d2tmp10.val[0];    d16u8 = d2tmp10.val[1];    d17u8 = d2tmp11.val[0];    d18u8 = d2tmp11.val[1];    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,                       &d5u8);    d4Result.val[0] = d0u8;    d4Result.val[1] = d1u8;    d4Result.val[2] = d2u8;    d4Result.val[3] = d3u8;    d2Result.val[0] = d4u8;    d2Result.val[1] = d5u8;    s = src - 3;    vst4_lane_u8(s, d4Result, 0);    s += pitch;    vst4_lane_u8(s, d4Result, 1);    s += pitch;    vst4_lane_u8(s, d4Result, 2);    s += pitch;    vst4_lane_u8(s, d4Result, 3);    s += pitch;    vst4_lane_u8(s, d4Result, 4);    s += pitch;    vst4_lane_u8(s, d4Result, 5);    s += pitch;    vst4_lane_u8(s, d4Result, 6);    s += pitch;    vst4_lane_u8(s, d4Result, 7);    s = src + 1;    vst2_lane_u8(s, d2Result, 0);    s += pitch;//.........这里部分代码省略.........
开发者ID:Wafflespeanut,项目名称:gecko-dev,代码行数:101,


示例28: crypto_stream_xor

int crypto_stream_xor(        unsigned char *c,  const unsigned char *m,unsigned long long mlen,  const unsigned char *n,  const unsigned char *k){  const uint32x4_t abab = {-1,0,-1,0};  const uint64x1_t nextblock = {1};  uint32x4_t k0k1k2k3 = (uint32x4_t) vld1q_u8((uint8_t *) k);  uint32x4_t k4k5k6k7 = (uint32x4_t) vld1q_u8((uint8_t *) (k + 16));  uint32x4_t start0 = (uint32x4_t) vld1q_u8((uint8_t *) sigma);  uint32x2_t n0n1 = (uint32x2_t) vld1_u8((uint8_t *) n);  uint32x2_t n2n3 = {0,0};  uint32x2_t k0k1 = vget_low_u32(k0k1k2k3);  uint32x2_t k2k3 = vget_high_u32(k0k1k2k3);  uint32x2_t k4k5 = vget_low_u32(k4k5k6k7);  uint32x2_t k6k7 = vget_high_u32(k4k5k6k7);  uint32x2_t n1n0 = vext_u32(n0n1,n0n1,1);  uint32x2_t n3n2;  uint32x2_t n0k4 = vext_u32(n1n0,k4k5,1);  uint32x2_t k5k0 = vext_u32(k4k5,k0k1,1);  uint32x2_t k1n1 = vext_u32(k0k1,n1n0,1);  uint32x2_t n2k6;  uint32x2_t k7k2 = vext_u32(k6k7,k2k3,1);  uint32x2_t k3n3;  uint32x4_t start1 = vcombine_u32(k5k0,n0k4);  uint32x4_t start2;  uint32x4_t start3;  register uint32x4_t diag0;  register uint32x4_t diag1;  register uint32x4_t diag2;  register uint32x4_t diag3;  uint32x4_t next_start2;  uint32x4_t next_start3;  register uint32x4_t next_diag0;  register uint32x4_t next_diag1;  register uint32x4_t next_diag2;  register uint32x4_t next_diag3;  uint32x4_t x0x5x10x15;  uint32x4_t x12x1x6x11;  uint32x4_t x8x13x2x7;  uint32x4_t x4x9x14x3;  uint32x4_t x0x1x10x11;  uint32x4_t x12x13x6x7;  uint32x4_t x8x9x2x3;  uint32x4_t x4x5x14x15;  uint32x4_t x0x1x2x3;  uint32x4_t x4x5x6x7;  uint32x4_t x8x9x10x11;  uint32x4_t x12x13x14x15;  uint32x4_t m0m1m2m3;  uint32x4_t m4m5m6m7;  uint32x4_t m8m9m10m11;  uint32x4_t m12m13m14m15;  register uint32x4_t a0;  register uint32x4_t a1;  register uint32x4_t a2;  register uint32x4_t a3;  register uint32x4_t b0;  register uint32x4_t b1;  register uint32x4_t b2;  register uint32x4_t b3;  register uint32x4_t next_a0;  register uint32x4_t next_a1;  register uint32x4_t next_a2;  register uint32x4_t 
next_a3;  register uint32x4_t next_b0;  register uint32x4_t next_b1;  register uint32x4_t next_b2;  register uint32x4_t next_b3;  unsigned char block[64];  unsigned char *savec;  int i;  int flagm = (m != 0);  if (!mlen) return 0;  if (mlen < 128) goto mlenatleast1;  mlenatleast128:  n3n2 = vext_u32(n2n3,n2n3,1);  n2k6 = vext_u32(n3n2,k6k7,1);  k3n3 = vext_u32(k2k3,n3n2,1);  start2 = vcombine_u32(n2k6,k1n1);  start3 = vcombine_u32(k3n3,k7k2);  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);  diag0 = start0;  diag1 = start1;  diag2 = start2;  diag3 = start3;  n3n2 = vext_u32(n2n3,n2n3,1);  n2k6 = vext_u32(n3n2,k6k7,1);  k3n3 = vext_u32(k2k3,n3n2,1);  next_start2 = vcombine_u32(n2k6,k1n1);  next_start3 = vcombine_u32(k3n3,k7k2);//.........这里部分代码省略.........
开发者ID:BurnBeforeReading,项目名称:cjdns,代码行数:101,


示例29: ne10_img_hresize_4channels_linear_neon

void ne10_img_hresize_4channels_linear_neon (const unsigned char** src, int** dst, int count,        const int* xofs, const short* alpha,        int swidth, int dwidth, int cn, int xmin, int xmax){    int dx, k;    int dx0 = 0;    int16x4x2_t alpha_vec;    uint8x8_t dS0_vec, dS1_vec;    int16x8_t qS0_vec, qS1_vec;    int16x4_t dS0_0123, dS0_4567, dS1_0123, dS1_4567;    int32x4_t qT0_vec, qT1_vec;    int16x4_t dCoeff;    dCoeff = vdup_n_s16 (INTER_RESIZE_COEF_SCALE);    for (k = 0; k <= count - 2; k++)    {        const unsigned char *S0 = src[k], *S1 = src[k + 1];        int *D0 = dst[k], *D1 = dst[k + 1];        for (dx = dx0; dx < xmax; dx += 4)        {            int sx = xofs[dx];            alpha_vec = vld2_s16 (&alpha[dx * 2]);            dS0_vec = vld1_u8 (&S0[sx]);            dS1_vec = vld1_u8 (&S1[sx]);            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));            dS0_0123 = vget_low_s16 (qS0_vec);            dS0_4567 = vget_high_s16 (qS0_vec);            dS1_0123 = vget_low_s16 (qS1_vec);            dS1_4567 = vget_high_s16 (qS1_vec);            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);            qT1_vec = vmull_s16 (dS1_0123, alpha_vec.val[0]);            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);            qT1_vec = vmlal_s16 (qT1_vec, dS1_4567, alpha_vec.val[1]);            vst1q_s32 (&D0[dx], qT0_vec);            vst1q_s32 (&D1[dx], qT1_vec);        }        for (; dx < dwidth; dx += 4)        {            int sx = xofs[dx];            dS0_vec = vld1_u8 (&S0[sx]);            dS1_vec = vld1_u8 (&S1[sx]);            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));            dS0_0123 = vget_low_s16 (qS0_vec);            dS1_0123 = vget_low_s16 (qS1_vec);            qT0_vec = vmull_s16 (dS0_0123, dCoeff);            qT1_vec = vmull_s16 (dS1_0123, dCoeff);            vst1q_s32 
(&D0[dx], qT0_vec);            vst1q_s32 (&D1[dx], qT1_vec);        }    }    for (; k < count; k++)    {        const unsigned char *S = src[k];        int *D = dst[k];        for (dx = 0; dx < xmax; dx += 4)        {            int sx = xofs[dx];            alpha_vec = vld2_s16 (&alpha[dx * 2]);            dS0_vec = vld1_u8 (&S[sx]);            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));            dS0_0123 = vget_low_s16 (qS0_vec);            dS0_4567 = vget_high_s16 (qS0_vec);            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);            vst1q_s32 (&D[dx], qT0_vec);        }        for (; dx < dwidth; dx += 4)        {            int sx = xofs[dx];            dS0_vec = vld1_u8 (&S[sx]);            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));            dS0_0123 = vget_low_s16 (qS0_vec);            qT0_vec = vmull_s16 (dS0_0123, dCoeff);//.........这里部分代码省略.........
开发者ID:HarveyLiuFly,项目名称:Ne10,代码行数:101,


示例30: ne10_img_vresize_linear_neon

/**
 * Blend eight consecutive 32-bit samples from two rows into eight bytes.
 *
 * For each lane i in 0..7 this computes
 *   (s0[i] * beta0 + s1[i] * beta1 + DELTA) >> BITS
 * (arithmetic shift), clamps the result to [0, 255] and narrows it to 8 bits.
 *
 * @param s0    first source row; eight ints are loaded starting here
 * @param s1    second source row; eight ints are loaded starting here
 * @param dBeta lane 0 holds the weight for s0, lane 1 the weight for s1
 * @return the eight blended, clamped bytes
 */
static inline uint8x8_t ne10_img_vresize_linear_blend8_neon (const int* s0, const int* s1, int32x2_t dBeta)
{
    /* Loop-invariant constants; expected to be hoisted by the optimizer once
     * this helper is inlined into its caller. */
    const int32x4_t qDelta = vdupq_n_s32 (DELTA);
    const int32x4_t qMin = vdupq_n_s32 (0);
    const int32x4_t qMax = vdupq_n_s32 (255);

    /* Weighted sum of the two rows, four lanes at a time. */
    int32x4_t qT_0123 = vmulq_lane_s32 (vld1q_s32 (s0), dBeta, 0);
    int32x4_t qT_4567 = vmulq_lane_s32 (vld1q_s32 (s0 + 4), dBeta, 0);
    qT_0123 = vmlaq_lane_s32 (qT_0123, vld1q_s32 (s1), dBeta, 1);
    qT_4567 = vmlaq_lane_s32 (qT_4567, vld1q_s32 (s1 + 4), dBeta, 1);

    /* Add the DELTA bias, then drop the BITS fractional bits. */
    qT_0123 = vshrq_n_s32 (vaddq_s32 (qT_0123, qDelta), BITS);
    qT_4567 = vshrq_n_s32 (vaddq_s32 (qT_4567, qDelta), BITS);

    /* Clamp to [0, 255] before narrowing so the truncating moves below
     * cannot wrap. */
    qT_0123 = vminq_s32 (vmaxq_s32 (qT_0123, qMin), qMax);
    qT_4567 = vminq_s32 (vmaxq_s32 (qT_4567, qMin), qMax);

    /* 32 -> 16 -> 8 bit narrowing. */
    const int16x8_t qT_s16 = vcombine_s16 (vmovn_s32 (qT_0123), vmovn_s32 (qT_4567));
    return vmovn_u16 (vreinterpretq_u16_s16 (qT_s16));
}

/**
 * Vertical linear blend of two intermediate rows into one row of bytes.
 *
 * Every output byte dst[x] is
 *   clamp ((src[0][x] * beta[0] + src[1][x] * beta[1] + DELTA) >> BITS, 0, 255).
 *
 * @param src   src[0] and src[1] are the two rows of 32-bit samples to blend
 * @param dst   destination row of bytes
 * @param beta  beta[0] weights src[0], beta[1] weights src[1]
 * @param width number of output bytes to produce
 *
 * NOTE(review): when width is not a multiple of 8 (and width > 0), the tail
 * path still loads eight ints from each source row and reads and writes eight
 * bytes at dst[x]; only the bytes selected by the residual mask are changed.
 * This presumably relies on callers providing buffers that tolerate the
 * over-read/over-write -- confirm against the callers.
 */
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width)
{
    const int *S0 = src[0], *S1 = src[1];

    /* Seed both lanes with beta[0] and then overwrite lane 1, so no lane of
     * the vector is ever read while uninitialized. */
    int32x2_t dBeta = vdup_n_s32 ((int) beta[0]);
    dBeta = vset_lane_s32 ((int) beta[1], dBeta, 1);

    int x = 0;

    /* Full groups of eight output bytes. */
    for (; x <= width - 8; x += 8)
    {
        vst1_u8 (&dst[x], ne10_img_vresize_linear_blend8_neon (&S0[x], &S1[x], dBeta));
    }

    /* Remaining 1..7 bytes: compute a full vector and merge it with the bytes
     * already in dst, using the mask picked by the number of leftover bytes. */
    if (x < width)
    {
        const uint8x8_t dMask = vld1_u8 ((const uint8_t *) &ne10_img_vresize_linear_mask_residual_table[width - x - 1]);
        const uint8x8_t dDst_01234567 = vld1_u8 (&dst[x]);
        const uint8x8_t dT_01234567 = ne10_img_vresize_linear_blend8_neon (&S0[x], &S1[x], dBeta);

        /* Take new bits where the mask is set, keep the old dst bits elsewhere. */
        vst1_u8 (&dst[x], vbsl_u8 (dMask, dT_01234567, dDst_01234567));
    }
}
开发者ID:HarveyLiuFly,项目名称:Ne10,代码行数:87,



注:本文中的vld1_u8函数示例整理自GitHub/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


C++ vld1q_f32函数代码示例
C++ vlclua_get_this函数代码示例
万事OK自学网:51自学网_软件自学网_CAD自学网,自学Excel、自学PS、自学CAD、自学C语言、自学CSS3实例,是一个通过网络自主学习工作技能的自学平台,也是网友喜欢的软件自学网站。