1 int i; 2 float64x2_t sum0 = vdupq_n_f64(0.0f); 3 float64x2_t sum1 = vdupq_n_f64(0.0f); 4 float64x2_t sum2 = vdupq_n_f64(0.0f); 5 float64x2_t sum3 = vdupq_n_f64(0.0f); 6 float64x2_t d0 = vdupq_n_f64(0.0f); 7 float64x2_t d1 = vdupq_n_f64(0.0f); 8 float64x2_t d2 = vdupq_n_f64(0.0f); 9 float64x2_t d3 = vdupq_n_f64(0.0f); 10 #if MAX_LAG > 8 11 float64x2_t sum4 = vdupq_n_f64(0.0f); 12 float64x2_t d4 = vdupq_n_f64(0.0f); 13 #endif 14 #if MAX_LAG > 10 15 float64x2_t sum5 = vdupq_n_f64(0.0f); 16 float64x2_t sum6 = vdupq_n_f64(0.0f); 17 float64x2_t d5 = vdupq_n_f64(0.0f); 18 float64x2_t d6 = vdupq_n_f64(0.0f); 19 #endif 20 float64x2_t d; 21 22 (void)lag; 23 FLAC__ASSERT(lag <= MAX_LAG); 24 25 // Loop backwards through samples from data_len to 0 26 for (i = data_len - 1; i >= 0; i--) 27 { 28 d = vdupq_n_f64(data[i]); // Create vector with 2 entries data[i] 29 30 // The next 6 lines of code right-shift the elements through the 7 vectors d0..d6. 31 // The 7th line adds the newly loaded element to d0. This works like a stack, where 32 // data[i] is pushed onto the stack every time and the 9th element falls off 33 #if MAX_LAG > 10 34 d6 = vextq_f64(d5,d6,1); 35 d5 = vextq_f64(d4,d5,1); 36 #endif 37 #if MAX_LAG > 8 38 d4 = vextq_f64(d3,d4,1); 39 #endif 40 d3 = vextq_f64(d2,d3,1); 41 d2 = vextq_f64(d1,d2,1); 42 d1 = vextq_f64(d0,d1,1); 43 d0 = vextq_f64(d,d0,1); 44 45 // Fused multiply-add sum += d * d0..d6 46 sum0 = vfmaq_f64(sum0, d, d0); 47 sum1 = vfmaq_f64(sum1, d, d1); 48 sum2 = vfmaq_f64(sum2, d, d2); 49 sum3 = vfmaq_f64(sum3, d, d3); 50 #if MAX_LAG > 8 51 sum4 = vfmaq_f64(sum4, d, d4); 52 #endif 53 #if MAX_LAG > 10 54 sum5 = vfmaq_f64(sum5, d, d5); 55 sum6 = vfmaq_f64(sum6, d, d6); 56 #endif 57 } 58 59 // Store sum0..sum6 in autoc[0..14] 60 vst1q_f64(autoc, sum0); 61 vst1q_f64(autoc + 2, sum1); 62 vst1q_f64(autoc + 4, sum2); 63 vst1q_f64(autoc + 6, sum3); 64 #if MAX_LAG > 8 65 vst1q_f64(autoc + 8, sum4); 66 #endif 67 #if MAX_LAG > 10 68 vst1q_f64(autoc + 10, sum5); 69 vst1q_f64(autoc + 12, sum6); 70 #endif 71