xref: /aosp_15_r20/external/gemmlowp/internal/pack_avx.h (revision 5f39d1b313f0528e11bae88b3029b54b9e1033e7)
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*5f39d1b3SJooyung Han 
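// pack_avx.h: optimized AVX specializations of the templates in pack.h.
// Note: the 256-bit integer intrinsics used in this file (for example
// _mm256_permute4x64_epi64 and _mm256_madd_epi16) require AVX2.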
16*5f39d1b3SJooyung Han 
17*5f39d1b3SJooyung Han #ifndef GEMMLOWP_INTERNAL_PACK_AVX_H_
18*5f39d1b3SJooyung Han #define GEMMLOWP_INTERNAL_PACK_AVX_H_
19*5f39d1b3SJooyung Han 
20*5f39d1b3SJooyung Han #include <immintrin.h>
21*5f39d1b3SJooyung Han #include "pack.h"
22*5f39d1b3SJooyung Han 
23*5f39d1b3SJooyung Han namespace gemmlowp {
24*5f39d1b3SJooyung Han 
// TODO: Add DepthMajorUint8SideMap
26*5f39d1b3SJooyung Han 
27*5f39d1b3SJooyung Han typedef SideMap<const std::uint8_t, SideMapOrder::WidthMajor>
28*5f39d1b3SJooyung Han     WidthMajorUint8SideMap;
29*5f39d1b3SJooyung Han 
30*5f39d1b3SJooyung Han template <int Cells>
31*5f39d1b3SJooyung Han using WidthMajorSideFormatNCells4x2 =
32*5f39d1b3SJooyung Han     KernelSideFormat<CellFormat<8, 2, CellOrder::WidthMajor>, Cells>;
33*5f39d1b3SJooyung Han 
34*5f39d1b3SJooyung Han template <int Cells>
35*5f39d1b3SJooyung Han class PackingRegisterBlock<
36*5f39d1b3SJooyung Han     WidthMajorUint8SideMap,
37*5f39d1b3SJooyung Han     PackedSideBlock<WidthMajorSideFormatNCells4x2<Cells>>>
38*5f39d1b3SJooyung Han     : public PackingRegisterBlockBase<
39*5f39d1b3SJooyung Han           WidthMajorUint8SideMap,
40*5f39d1b3SJooyung Han           PackedSideBlock<WidthMajorSideFormatNCells4x2<Cells>>> {
41*5f39d1b3SJooyung Han  public:
42*5f39d1b3SJooyung Han   typedef WidthMajorSideFormatNCells4x2<Cells> KernelSideFormat;
43*5f39d1b3SJooyung Han   typedef typename KernelSideFormat::Cell CellFormat;
44*5f39d1b3SJooyung Han   static const int kCells = KernelSideFormat::kCells;
45*5f39d1b3SJooyung Han   static const int kCellWidth = CellFormat::kWidth;
46*5f39d1b3SJooyung Han   static const int kKernelWidth = CellFormat::kWidth * kCells;
47*5f39d1b3SJooyung Han   static const int kCellDepth = CellFormat::kDepth;
48*5f39d1b3SJooyung Han   static const int kCellSize = CellFormat::kSize;
49*5f39d1b3SJooyung Han 
Pack(PackedSideBlock<KernelSideFormat> * dst,int start_width)50*5f39d1b3SJooyung Han   void Pack(PackedSideBlock<KernelSideFormat> *dst, int start_width) {
51*5f39d1b3SJooyung Han     std::uint8_t *dst_ptr = dst->current_data();
52*5f39d1b3SJooyung Han     const int width_stride = this->complete_src_.width_stride();
53*5f39d1b3SJooyung Han     int depth_step = 16;
54*5f39d1b3SJooyung Han 
55*5f39d1b3SJooyung Han     __m256i one = _mm256_set1_epi16(1);
56*5f39d1b3SJooyung Han     for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
57*5f39d1b3SJooyung Han          cell_start_depth += depth_step) {
58*5f39d1b3SJooyung Han       for (int cell_start_width = 0; cell_start_width < kKernelWidth;
59*5f39d1b3SJooyung Han            cell_start_width += kCellWidth) {
60*5f39d1b3SJooyung Han         std::int32_t *cell_sums_of_each_slice_ptr =
61*5f39d1b3SJooyung Han             dst->sums_of_each_slice() + start_width + cell_start_width;
62*5f39d1b3SJooyung Han         const std::uint8_t *src_data =
63*5f39d1b3SJooyung Han             this->complete_src_.data(cell_start_width, cell_start_depth);
64*5f39d1b3SJooyung Han 
65*5f39d1b3SJooyung Han         __m128i xmm1 =
66*5f39d1b3SJooyung Han             _mm_loadu_si128(reinterpret_cast<const __m128i *>(&src_data[0]));
67*5f39d1b3SJooyung Han         __m128i xmm2 = _mm_loadu_si128(
68*5f39d1b3SJooyung Han             reinterpret_cast<const __m128i *>(&src_data[1 * width_stride]));
69*5f39d1b3SJooyung Han         __m128i xmm3 = _mm_loadu_si128(
70*5f39d1b3SJooyung Han             reinterpret_cast<const __m128i *>(&src_data[2 * width_stride]));
71*5f39d1b3SJooyung Han         __m128i xmm4 = _mm_loadu_si128(
72*5f39d1b3SJooyung Han             reinterpret_cast<const __m128i *>(&src_data[3 * width_stride]));
73*5f39d1b3SJooyung Han         __m128i xmm5 = _mm_loadu_si128(
74*5f39d1b3SJooyung Han             reinterpret_cast<const __m128i *>(&src_data[4 * width_stride]));
75*5f39d1b3SJooyung Han         __m128i xmm6 = _mm_loadu_si128(
76*5f39d1b3SJooyung Han             reinterpret_cast<const __m128i *>(&src_data[5 * width_stride]));
77*5f39d1b3SJooyung Han         __m128i xmm7 = _mm_loadu_si128(
78*5f39d1b3SJooyung Han             reinterpret_cast<const __m128i *>(&src_data[6 * width_stride]));
79*5f39d1b3SJooyung Han         __m128i xmm8 = _mm_loadu_si128(
80*5f39d1b3SJooyung Han             reinterpret_cast<const __m128i *>(&src_data[7 * width_stride]));
81*5f39d1b3SJooyung Han 
82*5f39d1b3SJooyung Han         __m256i ymm1 = _mm256_set_m128i(xmm5, xmm1);
83*5f39d1b3SJooyung Han         __m256i ymm2 = _mm256_set_m128i(xmm6, xmm2);
84*5f39d1b3SJooyung Han         __m256i ymm3 = _mm256_set_m128i(xmm7, xmm3);
85*5f39d1b3SJooyung Han         __m256i ymm4 = _mm256_set_m128i(xmm8, xmm4);
86*5f39d1b3SJooyung Han 
87*5f39d1b3SJooyung Han         __m256i ymm5 = _mm256_unpacklo_epi16(ymm1, ymm2);
88*5f39d1b3SJooyung Han         __m256i ymm6 = _mm256_unpacklo_epi16(ymm3, ymm4);
89*5f39d1b3SJooyung Han 
90*5f39d1b3SJooyung Han         __m256i ymm9 = _mm256_unpackhi_epi16(ymm1, ymm2);
91*5f39d1b3SJooyung Han         __m256i ymm10 = _mm256_unpackhi_epi16(ymm3, ymm4);
92*5f39d1b3SJooyung Han 
93*5f39d1b3SJooyung Han         __m256i ymm7 = _mm256_unpacklo_epi32(ymm5, ymm6);
94*5f39d1b3SJooyung Han         __m256i ymm8 = _mm256_unpackhi_epi32(ymm5, ymm6);
95*5f39d1b3SJooyung Han 
96*5f39d1b3SJooyung Han         __m256i ymm13 = _mm256_unpacklo_epi32(ymm9, ymm10);
97*5f39d1b3SJooyung Han         __m256i ymm14 = _mm256_unpackhi_epi32(ymm9, ymm10);
98*5f39d1b3SJooyung Han 
99*5f39d1b3SJooyung Han         __m256i ymm11 = _mm256_permute4x64_epi64(ymm7, 0xd8);
100*5f39d1b3SJooyung Han         __m256i ymm12 = _mm256_permute4x64_epi64(ymm8, 0xd8);
101*5f39d1b3SJooyung Han 
102*5f39d1b3SJooyung Han         __m256i ymm15 = _mm256_permute4x64_epi64(ymm13, 0xd8);
103*5f39d1b3SJooyung Han         __m256i ymm16 = _mm256_permute4x64_epi64(ymm14, 0xd8);
104*5f39d1b3SJooyung Han 
105*5f39d1b3SJooyung Han         __m128i xmm9 = _mm256_castsi256_si128(ymm11);
106*5f39d1b3SJooyung Han         __m128i xmm10 = _mm256_castsi256_si128(ymm12);
107*5f39d1b3SJooyung Han         __m128i xmm11 = _mm256_extracti128_si256(ymm11, 1);
108*5f39d1b3SJooyung Han         __m128i xmm12 = _mm256_extracti128_si256(ymm12, 1);
109*5f39d1b3SJooyung Han 
110*5f39d1b3SJooyung Han         xmm1 = _mm256_castsi256_si128(ymm15);
111*5f39d1b3SJooyung Han         xmm2 = _mm256_castsi256_si128(ymm16);
112*5f39d1b3SJooyung Han         xmm3 = _mm256_extracti128_si256(ymm15, 1);
113*5f39d1b3SJooyung Han         xmm4 = _mm256_extracti128_si256(ymm16, 1);
114*5f39d1b3SJooyung Han 
115*5f39d1b3SJooyung Han         _mm_storeu_si128(reinterpret_cast<__m128i *>(&dst_ptr[0]), xmm9);
116*5f39d1b3SJooyung Han         _mm_storeu_si128(
117*5f39d1b3SJooyung Han             reinterpret_cast<__m128i *>(&dst_ptr[kCellSize * kCells]), xmm11);
118*5f39d1b3SJooyung Han         _mm_storeu_si128(
119*5f39d1b3SJooyung Han             reinterpret_cast<__m128i *>(&dst_ptr[2 * kCellSize * kCells]),
120*5f39d1b3SJooyung Han             xmm10);
121*5f39d1b3SJooyung Han         _mm_storeu_si128(
122*5f39d1b3SJooyung Han             reinterpret_cast<__m128i *>(&dst_ptr[3 * kCellSize * kCells]),
123*5f39d1b3SJooyung Han             xmm12);
124*5f39d1b3SJooyung Han         _mm_storeu_si128(
125*5f39d1b3SJooyung Han             reinterpret_cast<__m128i *>(&dst_ptr[4 * kCellSize * kCells]),
126*5f39d1b3SJooyung Han             xmm1);
127*5f39d1b3SJooyung Han         _mm_storeu_si128(
128*5f39d1b3SJooyung Han             reinterpret_cast<__m128i *>(&dst_ptr[5 * kCellSize * kCells]),
129*5f39d1b3SJooyung Han             xmm3);
130*5f39d1b3SJooyung Han 
131*5f39d1b3SJooyung Han         _mm_storeu_si128(
132*5f39d1b3SJooyung Han             reinterpret_cast<__m128i *>(&dst_ptr[6 * kCellSize * kCells]),
133*5f39d1b3SJooyung Han             xmm2);
134*5f39d1b3SJooyung Han         _mm_storeu_si128(
135*5f39d1b3SJooyung Han             reinterpret_cast<__m128i *>(&dst_ptr[7 * kCellSize * kCells]),
136*5f39d1b3SJooyung Han             xmm4);
137*5f39d1b3SJooyung Han 
138*5f39d1b3SJooyung Han         ymm6 = _mm256_cvtepu8_epi16(xmm9);
139*5f39d1b3SJooyung Han         ymm7 = _mm256_madd_epi16(ymm6, one);
140*5f39d1b3SJooyung Han         __m256i sums_of_each_slice_xmm = _mm256_loadu_si256(
141*5f39d1b3SJooyung Han             reinterpret_cast<const __m256i *>(&cell_sums_of_each_slice_ptr[0]));
142*5f39d1b3SJooyung Han         sums_of_each_slice_xmm = _mm256_add_epi32(sums_of_each_slice_xmm, ymm7);
143*5f39d1b3SJooyung Han 
144*5f39d1b3SJooyung Han         ymm6 = _mm256_cvtepu8_epi16(xmm11);
145*5f39d1b3SJooyung Han         ymm7 = _mm256_madd_epi16(ymm6, one);
146*5f39d1b3SJooyung Han         sums_of_each_slice_xmm = _mm256_add_epi32(sums_of_each_slice_xmm, ymm7);
147*5f39d1b3SJooyung Han 
148*5f39d1b3SJooyung Han         ymm6 = _mm256_cvtepu8_epi16(xmm10);
149*5f39d1b3SJooyung Han         ymm7 = _mm256_madd_epi16(ymm6, one);
150*5f39d1b3SJooyung Han         sums_of_each_slice_xmm = _mm256_add_epi32(sums_of_each_slice_xmm, ymm7);
151*5f39d1b3SJooyung Han 
152*5f39d1b3SJooyung Han         ymm6 = _mm256_cvtepu8_epi16(xmm12);
153*5f39d1b3SJooyung Han         ymm7 = _mm256_madd_epi16(ymm6, one);
154*5f39d1b3SJooyung Han         sums_of_each_slice_xmm = _mm256_add_epi32(sums_of_each_slice_xmm, ymm7);
155*5f39d1b3SJooyung Han 
156*5f39d1b3SJooyung Han         ymm6 = _mm256_cvtepu8_epi16(xmm1);
157*5f39d1b3SJooyung Han         ymm7 = _mm256_madd_epi16(ymm6, one);
158*5f39d1b3SJooyung Han         sums_of_each_slice_xmm = _mm256_add_epi32(sums_of_each_slice_xmm, ymm7);
159*5f39d1b3SJooyung Han 
160*5f39d1b3SJooyung Han         ymm6 = _mm256_cvtepu8_epi16(xmm3);
161*5f39d1b3SJooyung Han         ymm7 = _mm256_madd_epi16(ymm6, one);
162*5f39d1b3SJooyung Han         sums_of_each_slice_xmm = _mm256_add_epi32(sums_of_each_slice_xmm, ymm7);
163*5f39d1b3SJooyung Han 
164*5f39d1b3SJooyung Han         ymm6 = _mm256_cvtepu8_epi16(xmm2);
165*5f39d1b3SJooyung Han         ymm7 = _mm256_madd_epi16(ymm6, one);
166*5f39d1b3SJooyung Han         sums_of_each_slice_xmm = _mm256_add_epi32(sums_of_each_slice_xmm, ymm7);
167*5f39d1b3SJooyung Han 
168*5f39d1b3SJooyung Han         ymm6 = _mm256_cvtepu8_epi16(xmm4);
169*5f39d1b3SJooyung Han         ymm7 = _mm256_madd_epi16(ymm6, one);
170*5f39d1b3SJooyung Han         sums_of_each_slice_xmm = _mm256_add_epi32(sums_of_each_slice_xmm, ymm7);
171*5f39d1b3SJooyung Han 
172*5f39d1b3SJooyung Han         _mm256_storeu_si256(
173*5f39d1b3SJooyung Han             reinterpret_cast<__m256i *>(&cell_sums_of_each_slice_ptr[0]),
174*5f39d1b3SJooyung Han             sums_of_each_slice_xmm);
175*5f39d1b3SJooyung Han         dst_ptr += kCellSize;
176*5f39d1b3SJooyung Han       }
177*5f39d1b3SJooyung Han       dst_ptr += 7 * kCellSize * kCells;
178*5f39d1b3SJooyung Han     }
179*5f39d1b3SJooyung Han     dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
180*5f39d1b3SJooyung Han   }
181*5f39d1b3SJooyung Han };
182*5f39d1b3SJooyung Han 
183*5f39d1b3SJooyung Han // Pack format for 4x2 rhs format
184*5f39d1b3SJooyung Han template <int Cells>
185*5f39d1b3SJooyung Han using RhsWidthMajorSideFormatNCells4x2 =
186*5f39d1b3SJooyung Han     KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, Cells>;
187*5f39d1b3SJooyung Han 
188*5f39d1b3SJooyung Han template <int Cells>
189*5f39d1b3SJooyung Han class PackingRegisterBlock<
190*5f39d1b3SJooyung Han     WidthMajorUint8SideMap,
191*5f39d1b3SJooyung Han     PackedSideBlock<RhsWidthMajorSideFormatNCells4x2<Cells>>>
192*5f39d1b3SJooyung Han     : public PackingRegisterBlockBase<
193*5f39d1b3SJooyung Han           WidthMajorUint8SideMap,
194*5f39d1b3SJooyung Han           PackedSideBlock<RhsWidthMajorSideFormatNCells4x2<Cells>>> {
195*5f39d1b3SJooyung Han  public:
196*5f39d1b3SJooyung Han   typedef RhsWidthMajorSideFormatNCells4x2<Cells> KernelSideFormat;
197*5f39d1b3SJooyung Han   typedef typename KernelSideFormat::Cell CellFormat;
198*5f39d1b3SJooyung Han   static const int kCells = KernelSideFormat::kCells;
199*5f39d1b3SJooyung Han   static const int kCellWidth = CellFormat::kWidth;
200*5f39d1b3SJooyung Han   static const int kKernelWidth = CellFormat::kWidth * kCells;
201*5f39d1b3SJooyung Han   static const int kCellDepth = CellFormat::kDepth;
202*5f39d1b3SJooyung Han   static const int kCellSize = CellFormat::kSize;
203*5f39d1b3SJooyung Han 
Pack(PackedSideBlock<KernelSideFormat> * dst,int start_width)204*5f39d1b3SJooyung Han   void Pack(PackedSideBlock<KernelSideFormat> *dst, int start_width) {
205*5f39d1b3SJooyung Han     std::uint8_t *dst_ptr = dst->current_data();
206*5f39d1b3SJooyung Han     const int width_stride = this->complete_src_.width_stride();
207*5f39d1b3SJooyung Han     int depth_step = 8;
208*5f39d1b3SJooyung Han 
209*5f39d1b3SJooyung Han     __m128i one = _mm_set1_epi16(1);
210*5f39d1b3SJooyung Han     for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
211*5f39d1b3SJooyung Han          cell_start_depth += depth_step) {
212*5f39d1b3SJooyung Han       for (int cell_start_width = 0; cell_start_width < kKernelWidth;
213*5f39d1b3SJooyung Han            cell_start_width += kCellWidth) {
214*5f39d1b3SJooyung Han         std::int32_t *cell_sums_of_each_slice_ptr =
215*5f39d1b3SJooyung Han             dst->sums_of_each_slice() + start_width + cell_start_width;
216*5f39d1b3SJooyung Han         const std::uint8_t *src_data =
217*5f39d1b3SJooyung Han             this->complete_src_.data(cell_start_width, cell_start_depth);
218*5f39d1b3SJooyung Han 
219*5f39d1b3SJooyung Han         __m128i xmm1 =
220*5f39d1b3SJooyung Han             _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&src_data[0]));
221*5f39d1b3SJooyung Han         __m128i xmm2 = _mm_loadl_epi64(
222*5f39d1b3SJooyung Han             reinterpret_cast<const __m128i *>(&src_data[1 * width_stride]));
223*5f39d1b3SJooyung Han         __m128i xmm3 = _mm_loadl_epi64(
224*5f39d1b3SJooyung Han             reinterpret_cast<const __m128i *>(&src_data[2 * width_stride]));
225*5f39d1b3SJooyung Han         __m128i xmm4 = _mm_loadl_epi64(
226*5f39d1b3SJooyung Han             reinterpret_cast<const __m128i *>(&src_data[3 * width_stride]));
227*5f39d1b3SJooyung Han 
228*5f39d1b3SJooyung Han         __m128i xmm5 = _mm_unpacklo_epi16(xmm1, xmm2);
229*5f39d1b3SJooyung Han         __m128i xmm8 = _mm_shuffle_epi32(xmm5, 0x31);
230*5f39d1b3SJooyung Han 
231*5f39d1b3SJooyung Han         __m128i xmm6 = _mm_unpacklo_epi16(xmm3, xmm4);
232*5f39d1b3SJooyung Han         __m128i xmm7 = _mm_shuffle_epi32(xmm6, 0x80);
233*5f39d1b3SJooyung Han 
234*5f39d1b3SJooyung Han         __m128i xmm9 = _mm_blend_epi16(xmm5, xmm7, 0xcc);
235*5f39d1b3SJooyung Han         __m128i xmm10 = _mm_blend_epi16(xmm8, xmm6, 0xcc);
236*5f39d1b3SJooyung Han 
237*5f39d1b3SJooyung Han         _mm_storel_epi64(reinterpret_cast<__m128i *>(&dst_ptr[0]), xmm9);
238*5f39d1b3SJooyung Han         _mm_storel_epi64(
239*5f39d1b3SJooyung Han             reinterpret_cast<__m128i *>(&dst_ptr[kCellSize * kCells]), xmm10);
240*5f39d1b3SJooyung Han 
241*5f39d1b3SJooyung Han         __m128i xmm11 = _mm_shuffle_epi32(xmm9, 0xee);
242*5f39d1b3SJooyung Han         __m128i xmm12 = _mm_shuffle_epi32(xmm10, 0xee);
243*5f39d1b3SJooyung Han 
244*5f39d1b3SJooyung Han         _mm_storel_epi64(
245*5f39d1b3SJooyung Han             reinterpret_cast<__m128i *>(&dst_ptr[2 * kCellSize * kCells]),
246*5f39d1b3SJooyung Han             xmm11);
247*5f39d1b3SJooyung Han         _mm_storel_epi64(
248*5f39d1b3SJooyung Han             reinterpret_cast<__m128i *>(&dst_ptr[3 * kCellSize * kCells]),
249*5f39d1b3SJooyung Han             xmm12);
250*5f39d1b3SJooyung Han 
251*5f39d1b3SJooyung Han         xmm1 = _mm_cvtepu8_epi16(xmm9);
252*5f39d1b3SJooyung Han         xmm2 = _mm_madd_epi16(xmm1, one);
253*5f39d1b3SJooyung Han         __m128i sums_of_each_slice_xmm = _mm_loadu_si128(
254*5f39d1b3SJooyung Han             reinterpret_cast<const __m128i *>(&cell_sums_of_each_slice_ptr[0]));
255*5f39d1b3SJooyung Han         sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
256*5f39d1b3SJooyung Han 
257*5f39d1b3SJooyung Han         xmm1 = _mm_cvtepu8_epi16(xmm10);
258*5f39d1b3SJooyung Han         xmm2 = _mm_madd_epi16(xmm1, one);
259*5f39d1b3SJooyung Han         sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
260*5f39d1b3SJooyung Han 
261*5f39d1b3SJooyung Han         xmm1 = _mm_cvtepu8_epi16(xmm11);
262*5f39d1b3SJooyung Han         xmm2 = _mm_madd_epi16(xmm1, one);
263*5f39d1b3SJooyung Han         sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
264*5f39d1b3SJooyung Han 
265*5f39d1b3SJooyung Han         xmm1 = _mm_cvtepu8_epi16(xmm12);
266*5f39d1b3SJooyung Han         xmm2 = _mm_madd_epi16(xmm1, one);
267*5f39d1b3SJooyung Han         sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
268*5f39d1b3SJooyung Han 
269*5f39d1b3SJooyung Han         _mm_storeu_si128(
270*5f39d1b3SJooyung Han             reinterpret_cast<__m128i *>(&cell_sums_of_each_slice_ptr[0]),
271*5f39d1b3SJooyung Han             sums_of_each_slice_xmm);
272*5f39d1b3SJooyung Han         dst_ptr += kCellSize;
273*5f39d1b3SJooyung Han       }
274*5f39d1b3SJooyung Han       dst_ptr += 3 * kCellSize * kCells;
275*5f39d1b3SJooyung Han     }
276*5f39d1b3SJooyung Han     dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
277*5f39d1b3SJooyung Han   }
278*5f39d1b3SJooyung Han };
279*5f39d1b3SJooyung Han 
280*5f39d1b3SJooyung Han }  // namespace gemmlowp
281*5f39d1b3SJooyung Han 
282*5f39d1b3SJooyung Han #endif  // GEMMLOWP_INTERNAL_PACK_AVX_H_
283