1*4bdc9457SAndroid Build Coastguard Worker // Copyright (c) Facebook, Inc. and its affiliates. 2*4bdc9457SAndroid Build Coastguard Worker // All rights reserved. 3*4bdc9457SAndroid Build Coastguard Worker // 4*4bdc9457SAndroid Build Coastguard Worker // Copyright 2019 Google LLC 5*4bdc9457SAndroid Build Coastguard Worker // 6*4bdc9457SAndroid Build Coastguard Worker // This source code is licensed under the BSD-style license found in the 7*4bdc9457SAndroid Build Coastguard Worker // LICENSE file in the root directory of this source tree. 8*4bdc9457SAndroid Build Coastguard Worker 9*4bdc9457SAndroid Build Coastguard Worker #include <arm_neon.h> 10*4bdc9457SAndroid Build Coastguard Worker 11*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/zip.h> 12*4bdc9457SAndroid Build Coastguard Worker 13*4bdc9457SAndroid Build Coastguard Worker xnn_x8_zip_x4_ukernel__neon(size_t n,const uint8_t * input,uint8_t * output)14*4bdc9457SAndroid Build Coastguard Workervoid xnn_x8_zip_x4_ukernel__neon( 15*4bdc9457SAndroid Build Coastguard Worker size_t n, 16*4bdc9457SAndroid Build Coastguard Worker const uint8_t* input, 17*4bdc9457SAndroid Build Coastguard Worker uint8_t* output) 18*4bdc9457SAndroid Build Coastguard Worker { 19*4bdc9457SAndroid Build Coastguard Worker const uint8_t* x = input; 20*4bdc9457SAndroid Build Coastguard Worker const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); 21*4bdc9457SAndroid Build Coastguard Worker const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n); 22*4bdc9457SAndroid Build Coastguard Worker const uint8_t* w = (const uint8_t*) ((uintptr_t) z + n); 23*4bdc9457SAndroid Build Coastguard Worker uint8_t* o = output; 24*4bdc9457SAndroid Build Coastguard Worker 25*4bdc9457SAndroid Build Coastguard Worker if (n >= 8) { 26*4bdc9457SAndroid Build Coastguard Worker do { 27*4bdc9457SAndroid Build Coastguard Worker uint8x8x4_t vxyzw; 28*4bdc9457SAndroid Build Coastguard Worker vxyzw.val[0] = vld1_u8(x); x += 8; 29*4bdc9457SAndroid Build Coastguard Worker vxyzw.val[1] = vld1_u8(y); y += 8; 30*4bdc9457SAndroid Build Coastguard Worker vxyzw.val[2] = vld1_u8(z); z += 8; 31*4bdc9457SAndroid Build Coastguard Worker vxyzw.val[3] = vld1_u8(w); w += 8; 32*4bdc9457SAndroid Build Coastguard Worker vst4_u8(o, vxyzw); o += 32; 33*4bdc9457SAndroid Build Coastguard Worker n -= 8; 34*4bdc9457SAndroid Build Coastguard Worker } while (n >= 8); 35*4bdc9457SAndroid Build Coastguard Worker if (n != 0) { 36*4bdc9457SAndroid Build Coastguard Worker const size_t address_increment = n - 8; 37*4bdc9457SAndroid Build Coastguard Worker uint8x8x4_t vxyzw; 38*4bdc9457SAndroid Build Coastguard Worker vxyzw.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment)); 39*4bdc9457SAndroid Build Coastguard Worker vxyzw.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment)); 40*4bdc9457SAndroid Build Coastguard Worker vxyzw.val[2] = vld1_u8((const uint8_t*) ((uintptr_t) z + address_increment)); 41*4bdc9457SAndroid Build Coastguard Worker vxyzw.val[3] = vld1_u8((const uint8_t*) ((uintptr_t) w + address_increment)); 42*4bdc9457SAndroid Build Coastguard Worker vst4_u8((uint8_t*) ((uintptr_t) o + address_increment * 4), vxyzw); 43*4bdc9457SAndroid Build Coastguard Worker } 44*4bdc9457SAndroid Build Coastguard Worker } else { 45*4bdc9457SAndroid Build Coastguard Worker do { 46*4bdc9457SAndroid Build Coastguard Worker const uint8_t vx = *x++; 47*4bdc9457SAndroid Build Coastguard Worker const uint8_t vy = *y++; 48*4bdc9457SAndroid Build Coastguard Worker const uint8_t vz = *z++; 49*4bdc9457SAndroid Build Coastguard Worker const uint8_t vw = *w++; 50*4bdc9457SAndroid Build Coastguard Worker o[0] = vx; 51*4bdc9457SAndroid Build Coastguard Worker o[1] = vy; 52*4bdc9457SAndroid Build Coastguard Worker o[2] = vz; 53*4bdc9457SAndroid Build Coastguard Worker o[3] = vw; 54*4bdc9457SAndroid Build Coastguard Worker o += 4; 55*4bdc9457SAndroid Build Coastguard Worker } while (--n != 0); 56*4bdc9457SAndroid Build Coastguard Worker } 57*4bdc9457SAndroid Build Coastguard Worker } 58