@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class0.s
@*
@* ,:brief
@*  Contains the function definition for the SAO (sample adaptive offset)
@*  edge offset filter of class 0 (horizontal direction). The function is
@*  coded in ARM NEON assembly (GNU assembler syntax).
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*  - ihevc_sao_edge_offset_class0_a9q()
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class0(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r7 =>  *pu1_avail
@r8 =>  *pi1_sao_offset
@r9 =>  wd
@r10=>  ht

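@ A rough C-level sketch of the per-pixel operation this routine vectorizes
@ (pseudo-code, not the exact C implementation; 8-bit samples are assumed, so
@ the clip ceiling is 255; SIGN/CLIP3 stand for the usual libhevc-style helpers):
@
@     sign_left  = SIGN(pu1_src[col] - pu1_src[col - 1]);
@     sign_right = SIGN(pu1_src[col] - pu1_src[col + 1]);
@     edge_idx   = gi1_table_edge_idx[2 + sign_left + sign_right] & au1_mask[col];
@     pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, 255);
@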
.equ    pu1_src_top_left_offset,    104
.equ    pu1_src_top_right_offset,   108
.equ    pu1_src_bot_left_offset,    112
.equ    pu1_avail_offset,           116
.equ    pi1_sao_offset,             120
.equ    wd_offset,                  124
.equ    ht_offset,                  128

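@ Stack layout note: the prologue pushes r4-r12 and r14 (10 registers, 40
@ bytes) followed by d8-d15 (64 bytes), 104 bytes in total, so the fifth
@ argument (pu1_src_top_left), which the caller placed at [sp], is found at
@ [sp, #104]; the remaining stack arguments follow at 4-byte intervals, as
@ encoded by the offsets above.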
.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class0_a9q

gi1_table_edge_idx_addr:
.long gi1_table_edge_idx - ulbl1 - 8
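@ Position-independent access to gi1_table_edge_idx: in ARM state the PC read
@ by "add r14,r14,pc" at ulbl1 is the address of that instruction + 8, so
@ storing (gi1_table_edge_idx - ulbl1 - 8) here and adding PC at ulbl1 yields
@ the absolute address of the table.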

ihevc_sao_edge_offset_class0_a9q:


    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8  -  d15}

    LDR         r9,[sp,#wd_offset]          @Loads wd

    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
    VMOV.I8     Q1,#2                       @const_2 = vdupq_n_s8(2)
    ADD         r11,r3,r9                   @pu1_src_top[wd]

    LDR         r10,[sp,#ht_offset]         @Loads ht
    VMOV.I16    Q2,#0                       @const_min_clip = vdupq_n_s16(0)
    LDRB        r12,[r11,#-1]               @pu1_src_top[wd - 1]

    LDR         r7,[sp,#pu1_avail_offset]   @Loads pu1_avail
    VMOV.I16    Q3,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    LDR         r14, gi1_table_edge_idx_addr @table pointer
ulbl1:
    add         r14,r14,pc

    LDR         r8,[sp,#pi1_sao_offset]     @Loads pi1_sao_offset
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    STRB        r12,[r4]                    @*pu1_src_top_left = pu1_src_top[wd - 1]

    MOV         r6,r0                       @pu1_src_org
    VLD1.8      D10,[r14]                   @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    SUB         r4,r10,#1                   @(ht - 1)

    MOV         r12,r9                      @Move wd to r12 for loop count
    VLD1.8      D11,[r8]                    @offset_tbl = vld1_s8(pi1_sao_offset)
    MUL         r4,r4,r1                    @(ht - 1) * src_strd

    ADD         r4,r4,r0                    @pu1_src[(ht - 1) * src_strd]

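@ Save the unfiltered last row of this block into pu1_src_top so that the
@ block below sees pre-SAO neighbour samples; the previous pu1_src_top[wd - 1]
@ was copied to *pu1_src_top_left above, before the row is overwritten here.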
SRC_TOP_LOOP:                               @wd is always multiple of 8
    VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
    SUBS        r12,r12,#8                  @Decrement the loop counter by 8
    VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
    BNE         SRC_TOP_LOOP
    ADD         r6,r6,#15                   @pu1_src_org += 16 - 1

    CMP         r9,#16                      @Compare wd with 16
    MOV         r3,r2                       @pu1_src_left backup to reload later
    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case

    MOV         r8,r9                       @move wd to r8 for loop count

WIDTH_LOOP_16:
    CMP         r8,r9                       @if(col == wd)
    BNE         AU1_MASK_FF                 @jump to else part
    LDRB        r12,[r7]                    @pu1_avail[0]
    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    B           SKIP_AU1_MASK_FF            @Skip the else part

AU1_MASK_FF:
    MOV         r12,#0xFF                   @move -1 to r12
    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

SKIP_AU1_MASK_FF:
    CMP         r8,#16                      @If col == 16 (last 16-pixel strip)
    BNE         SKIP_MASKING_IF_NOT16       @If not, skip masking
    LDRB        r12,[r7,#1]                 @pu1_avail[1]
    VMOV.8      D9[7],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

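@ au1_mask gates the classification at the block borders: for the leftmost
@ 16-pixel strip lane 0 holds pu1_avail[0] (left availability), for the last
@ strip lane 15 holds pu1_avail[1] (right availability), and all other lanes
@ stay 0xFF. A zero lane forces edge_idx to 0 after the table lookup, and
@ index 0 is expected to map to a zero offset, so unavailable border pixels
@ are left unchanged.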
SKIP_MASKING_IF_NOT16:
    MOV         r12,r0                      @pu1_src_cpy = pu1_src
    MOV         r4,r10                      @move ht to r4 for loop count

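@ PU1_SRC_LOOP processes two rows per iteration (the "II Iteration" comments
@ mark the second row), with the two rows' loads, compares and table lookups
@ interleaved to hide latency. Each row follows the same pattern: form
@ sign_left/sign_right from the byte-shifted neighbours (VEXT), derive
@ edge_idx and the per-pixel offset via VTBL, widen to 16 bit (VMOVL), add
@ the offset (VADDW), clamp to [0, 255] (VMAX/VMIN) and narrow back (VMOVN)
@ before storing the row.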
PU1_SRC_LOOP:
    LDRB        r11,[r2]                    @load pu1_src_left[0]; (ht - row) = 0 for the first row, pu1_src_left is incremented later
    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy)
    VLD1.8      D13,[r12], r1               @pu1_cur_row = vld1q_u8(pu1_src_cpy)
    SUB         r12,#8                      @r12 now points to the start of the next row
    SUB         r5,r9,r8                    @wd - col

    SUB         r14,r10,r4                  @ht - row
    VMOV.8      D15[7],r11                  @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    MUL         r14,r14,r1                  @(ht - row) * src_strd

    VLD1.8      D26,[r12]!                  @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
    VLD1.8      D27,[r12]                   @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
    SUB         r12,#8                      @rewind r12 to the start of the second row
    VEXT.8      Q7,Q7,Q6,#15                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
    ADD         r5,r14,r5                   @(ht - row) * src_strd + (wd - col)

    LDRB        r11,[r2, #1]                @II Iteration: load pu1_src_left[1]; (ht - row) = 1 for the second row
    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    LDRB        r14,[r6,r5]                 @pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]

    SUB         r4,r4,#1                    @Decrement row count (first of the two rows)
    VMOV.8      D29[7],r11                  @II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)

    SUB         r12,r12,r1                  @Decrement the pu1_src pointer by src_strd
    VSUB.I8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    STRB        r14,[r2],#1                 @pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]

    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
    VEXT.8      Q14,Q14,Q13,#15             @II Iteration pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
    SUB         r5,r9,r8                    @II wd - col

    ADD         r12,r12,r1                  @Increment the pu1_src pointer by src_strd
    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    VCGT.U8     Q15,Q13,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)

    LDRB        r11,[r12,#16]               @II pu1_src_cpy[16]
    VEXT.8      Q7,Q6,Q7,#1                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
    SUB         r14,r10,r4                  @II ht - row

    VCLT.U8     Q0,Q13,Q14                  @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VMOV.8      D28[0],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    SUB         r12,r12,r1                  @Decrement the pu1_src pointer by src_strd

    MUL         r14,r14,r1                  @II (ht - row) * src_strd
    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    ADD         r5,r14,r5                   @II (ht - row) * src_strd + (wd - col)

    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VEXT.8      Q14,Q13,Q14,#1              @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)

    LDRB        r14,[r6,r5]                 @II pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
    VSUB.I8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUBS        r4,r4,#1                    @Decrement row by 1

    VADD.I8     Q7,Q1,Q10                   @edge_idx = vaddq_s8(const_2, sign_left)
    STRB        r14,[r2],#1                 @II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]

    VADD.I8     Q7,Q7,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_right)
    VMOVL.U8    Q9,D12                      @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VSUB.I8     Q10,Q0,Q15                  @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D14,{D10},D14               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VCGT.U8     Q15,Q13,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)

    VCLT.U8     Q0,Q13,Q14                  @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VTBL.8      D15,{D10},D15               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VSUB.I8     Q11,Q0,Q15                  @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VAND        Q7,Q7,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
    VTBL.8      D16,{D11},D14               @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q0,D26                      @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VADD.I8     Q14,Q1,Q10                  @II edge_idx = vaddq_s8(const_2, sign_left)
    VADD.I8     Q14,Q14,Q11                 @II edge_idx = vaddq_s8(edge_idx, sign_right)

    VADDW.S8    Q9,Q9,D16                   @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D28,{D10},D28               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VTBL.8      D29,{D10},D29               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q9,Q9,Q3                    @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VAND        Q14,Q14,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    VTBL.8      D17,{D11},D15               @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))

    VMOVL.U8    Q7,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VTBL.8      D30,{D11},D28               @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADDW.S8    Q7,Q7,D17                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q7,Q7,Q2                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VTBL.8      D31,{D11},D29               @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q7,Q7,Q3                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D18,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q0,Q0,D30                   @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMOVN.I16   D19,Q7                      @vmovn_s16(pi2_tmp_cur_row.val[1])
    VMAX.S16    Q0,Q0,Q2                    @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMOVL.U8    Q14,D27                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMIN.U16    Q0,Q0,Q3                    @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D0,Q0                       @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q14,Q14,D31                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VST1.8      {D18,D19},[r12],r1          @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMIN.U16    Q14,Q14,Q3                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D1,Q14                      @II vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {D0,D1},[r12],r1            @II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BNE         PU1_SRC_LOOP                @If rows remain, jump back to the inner loop

    ADD         r0,r0,#16                   @pu1_src += 16

    SUBS        r8,r8,#16                   @Decrement column by 16
    CMP         r8,#8                       @Check whether residue remains
    MOV         r2,r3                       @Reload pu1_src_left
    BEQ         WIDTH_RESIDUE               @If exactly 8 columns remain, jump to the residue loop
    BGT         WIDTH_LOOP_16               @If more columns remain, jump back to WIDTH_LOOP_16
    BLT         END_LOOPS                   @Otherwise jump to the end of the function

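@ WIDTH_RESIDUE filters a final 8-pixel wide column. It is reached directly
@ when wd < 16, or from the bookkeeping above when exactly 8 columns remain
@ after the 16-wide strips (wd is a multiple of 8); with more than 8 columns
@ left the code loops back to WIDTH_LOOP_16, and with none it exits.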
WIDTH_RESIDUE:
    SUB         r6,r6,#15                   @undo the +15 bias on pu1_src_org
    AND         r8,r9,#0xF                  @wd_rem = wd & 0xF
    CMP         r8,#0                       @Residue check
    BEQ         END_LOOPS                   @No residue, jump to end of function

    CMP         r8,r9                       @if(wd_rem == wd)
    BNE         AU1_MASK_FF_RESIDUE         @jump to else part
    LDRB        r12,[r7]                    @pu1_avail[0]
    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    B           SKIP_AU1_MASK_FF_RESIDUE    @Skip the else part

AU1_MASK_FF_RESIDUE:
    MOV         r12,#0xFF                   @move -1 to r12
    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

SKIP_AU1_MASK_FF_RESIDUE:
    LDRB        r11,[r7,#1]                 @pu1_avail[1]
    SUB         r5,r9,#1                    @wd - 1

    MOV         r4,r10                      @move ht to r4 for loop count
    VMOV.8      D8[7],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
    MOV         r12,r0                      @pu1_src_cpy = pu1_src

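@ The residue loop mirrors PU1_SRC_LOOP one row at a time: the classification
@ still runs on 16 lanes, but only the low 8 result bytes (D28) are stored,
@ and pu1_src_left is refreshed from
@ pu1_src_org[(ht - row) * src_strd + (wd - 1)] for each row.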
PU1_SRC_LOOP_RESIDUE:
    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy)
    VLD1.8      D13,[r12]                   @pu1_cur_row = vld1q_u8(pu1_src_cpy)
    SUB         r12,#8                      @rewind to the start of the row
    LDRB        r11,[r2]                    @load pu1_src_left
    VMOV.8      D15[7],r11                  @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    VEXT.8      Q7,Q7,Q6,#15                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)

    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VSUB.I8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    VEXT.8      Q7,Q6,Q7,#1                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)

    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VSUB.I8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q12,Q1,Q10                  @edge_idx = vaddq_s8(const_2, sign_left)
    VADD.I8     Q12,Q12,Q11                 @edge_idx = vaddq_s8(edge_idx, sign_right)

    VTBL.8      D24,{D10},D24               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D25,{D10},D25               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q12,Q12,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q10,Q11                     @sign_left = vnegq_s8(sign_right)
    VEXT.8      Q10,Q10,Q11,#15             @sign_left = vextq_s8(sign_left, sign_left, 15)

    VTBL.8      D26,{D11},D24               @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D26                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q3                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    SUB         r14,r10,r4                  @ht - row
    MUL         r14,r14,r1                  @(ht - row) * src_strd
    ADD         r11,r14,r5                  @(ht - row) * src_strd + (wd - 1)
    LDRB        r14,[r6, r11]               @pu1_src_org[(ht - row) * src_strd + (wd - 1)]
    STRB        r14,[r2],#1                 @pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]

    VST1.8      {D28},[r12],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    SUBS        r4,r4,#1                    @Decrement row by 1
    BNE         PU1_SRC_LOOP_RESIDUE        @If rows remain, jump back to the residue loop

END_LOOPS:
    vpop        {d8  -  d15}
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP; loading r15 (PC) returns to the caller