xref: /btstack/3rd-party/micro-ecc/asm_arm.inc (revision 0561b2d8d5dba972c7daa57d5e677f7a1327edfd)
/* Decrement table: DEC_n expands to n - 1. Only n = 5, 6, 7 and 8 are defined. */
1#define DEC_5 4
2#define DEC_6 5
3#define DEC_7 6
4#define DEC_8 7
5
/* DEC(N) selects the DEC_<N> entry above by token pasting.
   uECC_CONCAT is defined outside this view; presumably it expands N before
   pasting, so that DEC(uECC_WORDS) works - TODO confirm. */
6#define DEC(N) uECC_CONCAT(DEC_, N)
7
/* REPEAT_k(stuff) expands to k consecutive copies of stuff (k = 1..8).
   Each level expands the previous level and appends one more copy. */
8#define REPEAT_1(stuff) stuff
9#define REPEAT_2(stuff) REPEAT_1(stuff) stuff
10#define REPEAT_3(stuff) REPEAT_2(stuff) stuff
11#define REPEAT_4(stuff) REPEAT_3(stuff) stuff
12#define REPEAT_5(stuff) REPEAT_4(stuff) stuff
13#define REPEAT_6(stuff) REPEAT_5(stuff) stuff
14#define REPEAT_7(stuff) REPEAT_6(stuff) stuff
15#define REPEAT_8(stuff) REPEAT_7(stuff) stuff
16
/* REPEAT(N, stuff) picks REPEAT_<N> via uECC_CONCAT (defined outside this view)
   and applies it to stuff. In this file it is used to unroll a run of asm
   template strings N times. */
17#define REPEAT(N, stuff) uECC_CONCAT(REPEAT_, N)(stuff)
18
/* STR2 stringifies its argument exactly as written. STR adds one level of
   indirection so that a macro argument is expanded before being stringified. */
19#define STR2(thing) #thing
20#define STR(thing) STR2(thing)
21
22#if (uECC_ASM == uECC_asm_fast)
23
/* Multi-word addition: result = left + right, word 0 being the least significant.
 *
 * One word is added with ADDS, then REPEAT(DEC(uECC_WORDS), ...) unrolls the
 * load/load/ADCS/store group for the remaining words (uECC_WORDS - 1 of them
 * for the DEC_5..DEC_8 entries defined in this file), so the carry flag ripples
 * through the whole chain.
 *
 * All three pointers are advanced by the asm (post-increment "!" addressing);
 * they are "+" operands, so only the local copies of the arguments change.
 *
 * Returns the carry out of the most significant word (0 or 1).
 */
24static uint32_t vli_add(uint32_t *result, const uint32_t *left, const uint32_t *right) {
25    uint32_t carry = 0;
26    uint32_t left_word;
27    uint32_t right_word;
28
29    __asm__ volatile (
30        ".syntax unified \n\t"
31        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word. */
32        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
33        "adds %[left], %[right] \n\t"     /* Add first word. */
34        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word. */
35
36        /* Now we just do the remaining words with the carry bit (using ADC) */
37        REPEAT(DEC(uECC_WORDS),
38            "ldmia %[lptr]!, {%[left]} \n\t"
39            "ldmia %[rptr]!, {%[right]} \n\t"
40            "adcs %[left], %[right] \n\t"
41            "stmia %[dptr]!, {%[left]} \n\t")
42
        /* carry was initialised to 0, so 0 + 0 + C leaves just the carry flag in it. */
43        "adcs %[carry], %[carry] \n\t" /* Store carry bit. */
        /* Switch the assembler back to divided syntax on every platform except thumb2. */
44    #if (uECC_PLATFORM != uECC_arm_thumb2)
45        ".syntax divided \n\t"
46    #endif
        /* The thumb build uses the "l" constraint for every operand; other builds use "r". */
47    #if (uECC_PLATFORM == uECC_arm_thumb)
48        : [dptr] "+l" (result), [lptr] "+l" (left), [rptr] "+l" (right),
49          [carry] "+l" (carry), [left] "=l" (left_word), [right] "=l" (right_word)
50    #else
51        : [dptr] "+r" (result), [lptr] "+r" (left), [rptr] "+r" (right),
52          [carry] "+r" (carry), [left] "=r" (left_word), [right] "=r" (right_word)
53    #endif
54        :
        /* Flags are modified and memory is read/written through the pointers. */
55        : "cc", "memory"
56    );
57    return carry;
58}
/* Presumably signals to the including file that an assembly vli_add is provided - TODO confirm. */
59#define asm_add 1
60
/* Multi-word subtraction: result = left - right, word 0 being the least significant.
 *
 * One word is subtracted with SUBS, then REPEAT(DEC(uECC_WORDS), ...) unrolls the
 * load/load/SBCS/store group for the remaining words, so the borrow (held in the
 * carry flag) ripples through the whole chain.
 *
 * All three pointers are advanced by the asm (post-increment "!" addressing);
 * they are "+" operands, so only the local copies of the arguments change.
 *
 * Returns 1 if the subtraction borrowed out of the most significant word,
 * 0 otherwise (the captured carry flag is inverted before returning).
 */
61static uint32_t vli_sub(uint32_t *result, const uint32_t *left, const uint32_t *right) {
62    uint32_t carry = 0;
63    uint32_t left_word;
64    uint32_t right_word;
65
66    __asm__ volatile (
67        ".syntax unified \n\t"
68        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word. */
69        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
70        "subs %[left], %[right] \n\t"     /* Subtract. */
71        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word. */
72
73        /* Now we just do the remaining words with the carry bit (using SBC) */
74        REPEAT(DEC(uECC_WORDS),
75            "ldmia %[lptr]!, {%[left]} \n\t"
76            "ldmia %[rptr]!, {%[right]} \n\t"
77            "sbcs %[left], %[right] \n\t"
78            "stmia %[dptr]!, {%[left]} \n\t")
79
        /* carry was initialised to 0, so 0 + 0 + C leaves just the carry flag in it. */
80        "adcs %[carry], %[carry] \n\t" /* Store carry bit. */
        /* Switch the assembler back to divided syntax on every platform except thumb2. */
81    #if (uECC_PLATFORM != uECC_arm_thumb2)
82        ".syntax divided \n\t"
83    #endif
        /* The thumb build uses the "l" constraint for every operand; other builds use "r". */
84    #if (uECC_PLATFORM == uECC_arm_thumb)
85        : [dptr] "+l" (result), [lptr] "+l" (left), [rptr] "+l" (right),
86          [carry] "+l" (carry), [left] "=l" (left_word), [right] "=l" (right_word)
87    #else
88        : [dptr] "+r" (result), [lptr] "+r" (left), [rptr] "+r" (right),
89          [carry] "+r" (carry), [left] "=r" (left_word), [right] "=r" (right_word)
90    #endif
91        :
        /* Flags are modified and memory is read/written through the pointers. */
92        : "cc", "memory"
93    );
94    return !carry; // note that on ARM, carry flag set means "no borrow" when subtracting
95                   // (for some reason...)
96}
/* Presumably signals to the including file that an assembly vli_sub is provided - TODO confirm. */
97#define asm_sub 1
98
99#if (uECC_PLATFORM != uECC_arm_thumb)
100#if (uECC_WORDS == 5)
/* Full multiplication for 5-word operands: writes the 10-word product
 * left * right to result (word 0 least significant).
 *
 * The assembly names registers directly, so the arguments are pinned to
 * r0 (result), r1 (left) and r2 (right) with register variables, and every
 * scratch register (r3-r12, r14) is listed as clobbered.
 *
 * Notation in the comments below: L[i] = left[i], R[j] = right[j].
 * Each output word ("column") is the sum of the L[i] * R[j] with i + j equal to
 * the column index, accumulated in a rotating three-register accumulator
 * (low word, high word, overflow) built from UMULL / ADDS / ADCS / ADC.
 *
 *   Pass 1: L[0..1] x R[3..4]        -> result[3..6]
 *   Pass 2: all remaining products   -> result[0..9], adding in the words
 *           that pass 1 left in result[3..6].
 *
 * result words are stored before all of left/right has been loaded, so
 * result should not overlap either input.
 */
101static void vli_mult(uint32_t *result, const uint32_t *left, const uint32_t *right) {
102    register uint32_t *r0 __asm__("r0") = result;
103    register const uint32_t *r1 __asm__("r1") = left;
104    register const uint32_t *r2 __asm__("r2") = right;
105
106    __asm__ volatile (
107        ".syntax unified \n\t"
        /* Pass 1. r0 -> result[3], r2 -> right[3]; r3,r4 = L[0],L[1]; r6,r7 = R[3],R[4]. */
108        "add r0, 12 \n\t"
109        "add r2, 12 \n\t"
110        "ldmia r1!, {r3,r4} \n\t"
111        "ldmia r2!, {r6,r7} \n\t"
112
        /* result[3] = low word of L[0] * R[3]. */
113        "umull r11, r12, r3, r6 \n\t"
114        "stmia r0!, {r11} \n\t"
115
        /* result[4]: L[0] * R[4] + L[1] * R[3] + carried high word. */
116        "mov r10, #0 \n\t"
117        "umull r11, r9, r3, r7 \n\t"
118        "adds r12, r11 \n\t"
119        "adc r9, #0 \n\t"
120        "umull r11, r14, r4, r6 \n\t"
121        "adds r12, r11 \n\t"
122        "adcs r9, r14 \n\t"
123        "adc r10, #0 \n\t"
124        "stmia r0!, {r12} \n\t"
125
        /* result[5], result[6]: L[1] * R[4] + carried words. */
126        "umull r12, r14, r4, r7 \n\t"
127        "adds r9, r12 \n\t"
128        "adc r10, r14 \n\t"
129        "stmia r0!, {r9, r10} \n\t"
130
        /* Pass 2. Rewind r0 to result[0] and r2 to right[0];
           r6,r7,r8 = R[0],R[1],R[2]; r5 = L[2] (r3,r4 still hold L[0],L[1]). */
131        "sub r0, 28 \n\t"
132        "sub r2, 20 \n\t"
133        "ldmia r2!, {r6,r7,r8} \n\t"
134        "ldmia r1!, {r5} \n\t"
135
        /* result[0] = low word of L[0] * R[0]. */
136        "umull r11, r12, r3, r6 \n\t"
137        "stmia r0!, {r11} \n\t"
138
        /* result[1]: L[0] * R[1] + L[1] * R[0]. */
139        "mov r10, #0 \n\t"
140        "umull r11, r9, r3, r7 \n\t"
141        "adds r12, r11 \n\t"
142        "adc r9, #0 \n\t"
143        "umull r11, r14, r4, r6 \n\t"
144        "adds r12, r11 \n\t"
145        "adcs r9, r14 \n\t"
146        "adc r10, #0 \n\t"
147        "stmia r0!, {r12} \n\t"
148
        /* result[2]: L[0] * R[2] + L[1] * R[1] + L[2] * R[0]. */
149        "mov r11, #0 \n\t"
150        "umull r12, r14, r3, r8 \n\t"
151        "adds r9, r12 \n\t"
152        "adcs r10, r14 \n\t"
153        "adc r11, #0 \n\t"
154        "umull r12, r14, r4, r7 \n\t"
155        "adds r9, r12 \n\t"
156        "adcs r10, r14 \n\t"
157        "adc r11, #0 \n\t"
158        "umull r12, r14, r5, r6 \n\t"
159        "adds r9, r12 \n\t"
160        "adcs r10, r14 \n\t"
161        "adc r11, #0 \n\t"
162        "stmia r0!, {r9} \n\t"
163
        /* result[3]: r3 is reloaded with L[3]; L[1] * R[2] + L[2] * R[1] + L[3] * R[0],
           plus the word pass 1 stored at result[3] (the ldr from [r0]). */
164        "ldmia r1!, {r3} \n\t"
165        "mov r12, #0 \n\t"
166        "umull r14, r9, r4, r8 \n\t"
167        "adds r10, r14 \n\t"
168        "adcs r11, r9 \n\t"
169        "adc r12, #0 \n\t"
170        "umull r14, r9, r5, r7 \n\t"
171        "adds r10, r14 \n\t"
172        "adcs r11, r9 \n\t"
173        "adc r12, #0 \n\t"
174        "umull r14, r9, r3, r6 \n\t"
175        "adds r10, r14 \n\t"
176        "adcs r11, r9 \n\t"
177        "adc r12, #0 \n\t"
178        "ldr r14, [r0] \n\t"
179        "adds r10, r14 \n\t"
180        "adcs r11, #0 \n\t"
181        "adc r12, #0 \n\t"
182        "stmia r0!, {r10} \n\t"
183
        /* result[4]: r4 is reloaded with L[4]; L[2] * R[2] + L[3] * R[1] + L[4] * R[0]
           + stored result[4]. */
184        "ldmia r1!, {r4} \n\t"
185        "mov r14, #0 \n\t"
186        "umull r9, r10, r5, r8 \n\t"
187        "adds r11, r9 \n\t"
188        "adcs r12, r10 \n\t"
189        "adc r14, #0 \n\t"
190        "umull r9, r10, r3, r7 \n\t"
191        "adds r11, r9 \n\t"
192        "adcs r12, r10 \n\t"
193        "adc r14, #0 \n\t"
194        "umull r9, r10, r4, r6 \n\t"
195        "adds r11, r9 \n\t"
196        "adcs r12, r10 \n\t"
197        "adc r14, #0 \n\t"
198        "ldr r9, [r0] \n\t"
199        "adds r11, r9 \n\t"
200        "adcs r12, #0 \n\t"
201        "adc r14, #0 \n\t"
202        "stmia r0!, {r11} \n\t"
203
        /* result[5]: r6 is reloaded with R[3]; L[2] * R[3] + L[3] * R[2] + L[4] * R[1]
           + stored result[5]. */
204        "ldmia r2!, {r6} \n\t"
205        "mov r9, #0 \n\t"
206        "umull r10, r11, r5, r6 \n\t"
207        "adds r12, r10 \n\t"
208        "adcs r14, r11 \n\t"
209        "adc r9, #0 \n\t"
210        "umull r10, r11, r3, r8 \n\t"
211        "adds r12, r10 \n\t"
212        "adcs r14, r11 \n\t"
213        "adc r9, #0 \n\t"
214        "umull r10, r11, r4, r7 \n\t"
215        "adds r12, r10 \n\t"
216        "adcs r14, r11 \n\t"
217        "adc r9, #0 \n\t"
218        "ldr r10, [r0] \n\t"
219        "adds r12, r10 \n\t"
220        "adcs r14, #0 \n\t"
221        "adc r9, #0 \n\t"
222        "stmia r0!, {r12} \n\t"
223
        /* result[6]: r7 is reloaded with R[4]; L[2] * R[4] + L[3] * R[3] + L[4] * R[2]
           + stored result[6]. */
224        "ldmia r2!, {r7} \n\t"
225        "mov r10, #0 \n\t"
226        "umull r11, r12, r5, r7 \n\t"
227        "adds r14, r11 \n\t"
228        "adcs r9, r12 \n\t"
229        "adc r10, #0 \n\t"
230        "umull r11, r12, r3, r6 \n\t"
231        "adds r14, r11 \n\t"
232        "adcs r9, r12 \n\t"
233        "adc r10, #0 \n\t"
234        "umull r11, r12, r4, r8 \n\t"
235        "adds r14, r11 \n\t"
236        "adcs r9, r12 \n\t"
237        "adc r10, #0 \n\t"
238        "ldr r11, [r0] \n\t"
239        "adds r14, r11 \n\t"
240        "adcs r9, #0 \n\t"
241        "adc r10, #0 \n\t"
242        "stmia r0!, {r14} \n\t"
243
        /* result[7]: L[3] * R[4] + L[4] * R[3]. */
244        "mov r11, #0 \n\t"
245        "umull r12, r14, r3, r7 \n\t"
246        "adds r9, r12 \n\t"
247        "adcs r10, r14 \n\t"
248        "adc r11, #0 \n\t"
249        "umull r12, r14, r4, r6 \n\t"
250        "adds r9, r12 \n\t"
251        "adcs r10, r14 \n\t"
252        "adc r11, #0 \n\t"
253        "stmia r0!, {r9} \n\t"
254
        /* result[8], result[9]: L[4] * R[4] + carried words. */
255        "umull r14, r9, r4, r7 \n\t"
256        "adds r10, r14 \n\t"
257        "adc r11, r9 \n\t"
258        "stmia r0!, {r10, r11} \n\t"
        /* Switch the assembler back to divided syntax on every platform except thumb2. */
259    #if (uECC_PLATFORM != uECC_arm_thumb2)
260        ".syntax divided \n\t"
261    #endif
        /* r0-r2 are modified by the asm, hence read-write operands. */
262        : "+r" (r0), "+r" (r1), "+r" (r2)
263        :
264        : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
265    );
266}
/* Presumably signals to the including file that an assembly vli_mult is provided - TODO confirm. */
267#define asm_mult 1
268#endif /* (uECC_WORDS == 5) */
269
270#if (uECC_WORDS == 6)
/* Full multiplication for 6-word operands: writes the 12-word product
 * left * right to result (word 0 least significant).
 *
 * The assembly names registers directly, so the arguments are pinned to
 * r0 (result), r1 (left) and r2 (right) with register variables, and every
 * scratch register (r3-r12, r14) is listed as clobbered.
 *
 * Notation in the comments below: L[i] = left[i], R[j] = right[j].
 * Each output word ("column") is the sum of the L[i] * R[j] with i + j equal to
 * the column index, accumulated in a rotating three-register accumulator
 * (low word, high word, overflow) built from UMULL / ADDS / ADCS / ADC.
 *
 *   Pass 1: L[0..2] x R[3..5]        -> result[3..8]
 *   Pass 2: all remaining products   -> result[0..11], adding in the words
 *           that pass 1 left in result[3..8].
 *
 * result words are stored before all of left/right has been loaded, so
 * result should not overlap either input.
 */
271static void vli_mult(uint32_t *result, const uint32_t *left, const uint32_t *right) {
272    register uint32_t *r0 __asm__("r0") = result;
273    register const uint32_t *r1 __asm__("r1") = left;
274    register const uint32_t *r2 __asm__("r2") = right;
275
276    __asm__ volatile (
277        ".syntax unified \n\t"
        /* Pass 1. r0 -> result[3], r2 -> right[3];
           r3,r4,r5 = L[0],L[1],L[2]; r6,r7,r8 = R[3],R[4],R[5]. */
278        "add r0, 12 \n\t"
279        "add r2, 12 \n\t"
280        "ldmia r1!, {r3,r4,r5} \n\t"
281        "ldmia r2!, {r6,r7,r8} \n\t"
282
        /* result[3] = low word of L[0] * R[3]. */
283        "umull r11, r12, r3, r6 \n\t"
284        "stmia r0!, {r11} \n\t"
285
        /* result[4]: L[0] * R[4] + L[1] * R[3]. */
286        "mov r10, #0 \n\t"
287        "umull r11, r9, r3, r7 \n\t"
288        "adds r12, r11 \n\t"
289        "adc r9, #0 \n\t"
290        "umull r11, r14, r4, r6 \n\t"
291        "adds r12, r11 \n\t"
292        "adcs r9, r14 \n\t"
293        "adc r10, #0 \n\t"
294        "stmia r0!, {r12} \n\t"
295
        /* result[5]: L[0] * R[5] + L[1] * R[4] + L[2] * R[3]. */
296        "mov r11, #0 \n\t"
297        "umull r12, r14, r3, r8 \n\t"
298        "adds r9, r12 \n\t"
299        "adcs r10, r14 \n\t"
300        "adc r11, #0 \n\t"
301        "umull r12, r14, r4, r7 \n\t"
302        "adds r9, r12 \n\t"
303        "adcs r10, r14 \n\t"
304        "adc r11, #0 \n\t"
305        "umull r12, r14, r5, r6 \n\t"
306        "adds r9, r12 \n\t"
307        "adcs r10, r14 \n\t"
308        "adc r11, #0 \n\t"
309        "stmia r0!, {r9} \n\t"
310
        /* result[6]: L[1] * R[5] + L[2] * R[4]. */
311        "mov r12, #0 \n\t"
312        "umull r14, r9, r4, r8 \n\t"
313        "adds r10, r14 \n\t"
314        "adcs r11, r9 \n\t"
315        "adc r12, #0 \n\t"
316        "umull r14, r9, r5, r7 \n\t"
317        "adds r10, r14 \n\t"
318        "adcs r11, r9 \n\t"
319        "adc r12, #0 \n\t"
320        "stmia r0!, {r10} \n\t"
321
        /* result[7], result[8]: L[2] * R[5] + carried words. */
322        "umull r9, r10, r5, r8 \n\t"
323        "adds r11, r9 \n\t"
324        "adc r12, r10 \n\t"
325        "stmia r0!, {r11, r12} \n\t"
326
        /* Pass 2. Rewind r0 to result[0] and r2 to right[0];
           r6,r7,r8 = R[0],R[1],R[2] (r3,r4,r5 still hold L[0],L[1],L[2]). */
327        "sub r0, 36 \n\t"
328        "sub r2, 24 \n\t"
329        "ldmia r2!, {r6,r7,r8} \n\t"
330
        /* result[0] = low word of L[0] * R[0]. */
331        "umull r11, r12, r3, r6 \n\t"
332        "stmia r0!, {r11} \n\t"
333
        /* result[1]: L[0] * R[1] + L[1] * R[0]. */
334        "mov r10, #0 \n\t"
335        "umull r11, r9, r3, r7 \n\t"
336        "adds r12, r11 \n\t"
337        "adc r9, #0 \n\t"
338        "umull r11, r14, r4, r6 \n\t"
339        "adds r12, r11 \n\t"
340        "adcs r9, r14 \n\t"
341        "adc r10, #0 \n\t"
342        "stmia r0!, {r12} \n\t"
343
        /* result[2]: L[0] * R[2] + L[1] * R[1] + L[2] * R[0]. */
344        "mov r11, #0 \n\t"
345        "umull r12, r14, r3, r8 \n\t"
346        "adds r9, r12 \n\t"
347        "adcs r10, r14 \n\t"
348        "adc r11, #0 \n\t"
349        "umull r12, r14, r4, r7 \n\t"
350        "adds r9, r12 \n\t"
351        "adcs r10, r14 \n\t"
352        "adc r11, #0 \n\t"
353        "umull r12, r14, r5, r6 \n\t"
354        "adds r9, r12 \n\t"
355        "adcs r10, r14 \n\t"
356        "adc r11, #0 \n\t"
357        "stmia r0!, {r9} \n\t"
358
        /* result[3]: r3 is reloaded with L[3]; L[1] * R[2] + L[2] * R[1] + L[3] * R[0],
           plus the word pass 1 stored at result[3] (the ldr from [r0]). */
359        "ldmia r1!, {r3} \n\t"
360        "mov r12, #0 \n\t"
361        "umull r14, r9, r4, r8 \n\t"
362        "adds r10, r14 \n\t"
363        "adcs r11, r9 \n\t"
364        "adc r12, #0 \n\t"
365        "umull r14, r9, r5, r7 \n\t"
366        "adds r10, r14 \n\t"
367        "adcs r11, r9 \n\t"
368        "adc r12, #0 \n\t"
369        "umull r14, r9, r3, r6 \n\t"
370        "adds r10, r14 \n\t"
371        "adcs r11, r9 \n\t"
372        "adc r12, #0 \n\t"
373        "ldr r14, [r0] \n\t"
374        "adds r10, r14 \n\t"
375        "adcs r11, #0 \n\t"
376        "adc r12, #0 \n\t"
377        "stmia r0!, {r10} \n\t"
378
        /* result[4]: r4 is reloaded with L[4]; L[2] * R[2] + L[3] * R[1] + L[4] * R[0]
           + stored result[4]. */
379        "ldmia r1!, {r4} \n\t"
380        "mov r14, #0 \n\t"
381        "umull r9, r10, r5, r8 \n\t"
382        "adds r11, r9 \n\t"
383        "adcs r12, r10 \n\t"
384        "adc r14, #0 \n\t"
385        "umull r9, r10, r3, r7 \n\t"
386        "adds r11, r9 \n\t"
387        "adcs r12, r10 \n\t"
388        "adc r14, #0 \n\t"
389        "umull r9, r10, r4, r6 \n\t"
390        "adds r11, r9 \n\t"
391        "adcs r12, r10 \n\t"
392        "adc r14, #0 \n\t"
393        "ldr r9, [r0] \n\t"
394        "adds r11, r9 \n\t"
395        "adcs r12, #0 \n\t"
396        "adc r14, #0 \n\t"
397        "stmia r0!, {r11} \n\t"
398
        /* result[5]: r5 is reloaded with L[5]; L[3] * R[2] + L[4] * R[1] + L[5] * R[0]
           + stored result[5]. */
399        "ldmia r1!, {r5} \n\t"
400        "mov r9, #0 \n\t"
401        "umull r10, r11, r3, r8 \n\t"
402        "adds r12, r10 \n\t"
403        "adcs r14, r11 \n\t"
404        "adc r9, #0 \n\t"
405        "umull r10, r11, r4, r7 \n\t"
406        "adds r12, r10 \n\t"
407        "adcs r14, r11 \n\t"
408        "adc r9, #0 \n\t"
409        "umull r10, r11, r5, r6 \n\t"
410        "adds r12, r10 \n\t"
411        "adcs r14, r11 \n\t"
412        "adc r9, #0 \n\t"
413        "ldr r10, [r0] \n\t"
414        "adds r12, r10 \n\t"
415        "adcs r14, #0 \n\t"
416        "adc r9, #0 \n\t"
417        "stmia r0!, {r12} \n\t"
418
        /* result[6]: r6 is reloaded with R[3]; L[3] * R[3] + L[4] * R[2] + L[5] * R[1]
           + stored result[6]. */
419        "ldmia r2!, {r6} \n\t"
420        "mov r10, #0 \n\t"
421        "umull r11, r12, r3, r6 \n\t"
422        "adds r14, r11 \n\t"
423        "adcs r9, r12 \n\t"
424        "adc r10, #0 \n\t"
425        "umull r11, r12, r4, r8 \n\t"
426        "adds r14, r11 \n\t"
427        "adcs r9, r12 \n\t"
428        "adc r10, #0 \n\t"
429        "umull r11, r12, r5, r7 \n\t"
430        "adds r14, r11 \n\t"
431        "adcs r9, r12 \n\t"
432        "adc r10, #0 \n\t"
433        "ldr r11, [r0] \n\t"
434        "adds r14, r11 \n\t"
435        "adcs r9, #0 \n\t"
436        "adc r10, #0 \n\t"
437        "stmia r0!, {r14} \n\t"
438
        /* result[7]: r7 is reloaded with R[4]; L[3] * R[4] + L[4] * R[3] + L[5] * R[2]
           + stored result[7]. */
439        "ldmia r2!, {r7} \n\t"
440        "mov r11, #0 \n\t"
441        "umull r12, r14, r3, r7 \n\t"
442        "adds r9, r12 \n\t"
443        "adcs r10, r14 \n\t"
444        "adc r11, #0 \n\t"
445        "umull r12, r14, r4, r6 \n\t"
446        "adds r9, r12 \n\t"
447        "adcs r10, r14 \n\t"
448        "adc r11, #0 \n\t"
449        "umull r12, r14, r5, r8 \n\t"
450        "adds r9, r12 \n\t"
451        "adcs r10, r14 \n\t"
452        "adc r11, #0 \n\t"
453        "ldr r12, [r0] \n\t"
454        "adds r9, r12 \n\t"
455        "adcs r10, #0 \n\t"
456        "adc r11, #0 \n\t"
457        "stmia r0!, {r9} \n\t"
458
        /* result[8]: r8 is reloaded with R[5]; L[3] * R[5] + L[4] * R[4] + L[5] * R[3]
           + stored result[8]. */
459        "ldmia r2!, {r8} \n\t"
460        "mov r12, #0 \n\t"
461        "umull r14, r9, r3, r8 \n\t"
462        "adds r10, r14 \n\t"
463        "adcs r11, r9 \n\t"
464        "adc r12, #0 \n\t"
465        "umull r14, r9, r4, r7 \n\t"
466        "adds r10, r14 \n\t"
467        "adcs r11, r9 \n\t"
468        "adc r12, #0 \n\t"
469        "umull r14, r9, r5, r6 \n\t"
470        "adds r10, r14 \n\t"
471        "adcs r11, r9 \n\t"
472        "adc r12, #0 \n\t"
473        "ldr r14, [r0] \n\t"
474        "adds r10, r14 \n\t"
475        "adcs r11, #0 \n\t"
476        "adc r12, #0 \n\t"
477        "stmia r0!, {r10} \n\t"
478
        /* result[9]: L[4] * R[5] + L[5] * R[4]. */
479        "mov r14, #0 \n\t"
480        "umull r9, r10, r4, r8 \n\t"
481        "adds r11, r9 \n\t"
482        "adcs r12, r10 \n\t"
483        "adc r14, #0 \n\t"
484        "umull r9, r10, r5, r7 \n\t"
485        "adds r11, r9 \n\t"
486        "adcs r12, r10 \n\t"
487        "adc r14, #0 \n\t"
488        "stmia r0!, {r11} \n\t"
489
        /* result[10], result[11]: L[5] * R[5] + carried words. */
490        "umull r10, r11, r5, r8 \n\t"
491        "adds r12, r10 \n\t"
492        "adc r14, r11 \n\t"
493        "stmia r0!, {r12, r14} \n\t"
        /* Switch the assembler back to divided syntax on every platform except thumb2. */
494    #if (uECC_PLATFORM != uECC_arm_thumb2)
495        ".syntax divided \n\t"
496    #endif
        /* r0-r2 are modified by the asm, hence read-write operands. */
497        : "+r" (r0), "+r" (r1), "+r" (r2)
498        :
499        : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
500    );
501}
/* Presumably signals to the including file that an assembly vli_mult is provided - TODO confirm. */
502#define asm_mult 1
503#endif /* (uECC_WORDS == 6) */
504
505#if (uECC_WORDS == 7)
/* Full multiplication for 7-word operands: writes the 14-word product
 * left * right to result (word 0 least significant).
 *
 * The assembly names registers directly, so the arguments are pinned to
 * r0 (result), r1 (left) and r2 (right) with register variables, and every
 * scratch register (r3-r12, r14) is listed as clobbered.
 *
 * Notation in the comments below: L[i] = left[i], R[j] = right[j].
 * Each output word ("column") is the sum of the L[i] * R[j] with i + j equal to
 * the column index, accumulated in a rotating three-register accumulator
 * (low word, high word, overflow) built from UMULL / ADDS / ADCS / ADC.
 *
 *   Pass 1: L[0] x R[6]                          -> result[6..7]
 *   Pass 2: rest of L[0..3] x R[3..6]            -> result[3..10]
 *   Pass 3: all remaining products               -> result[0..13]
 * Later passes add in the words earlier passes left in result (ldr from [r0]).
 *
 * result words are stored before all of left/right has been loaded, so
 * result should not overlap either input.
 */
506static void vli_mult(uint32_t *result, const uint32_t *left, const uint32_t *right) {
507    register uint32_t *r0 __asm__("r0") = result;
508    register const uint32_t *r1 __asm__("r1") = left;
509    register const uint32_t *r2 __asm__("r2") = right;
510
511    __asm__ volatile (
512        ".syntax unified \n\t"
        /* Pass 1. r0 -> result[6], r2 -> right[6]; r3 = L[0], r6 = R[6]. */
513        "add r0, 24 \n\t"
514        "add r2, 24 \n\t"
515        "ldmia r1!, {r3} \n\t"
516        "ldmia r2!, {r6} \n\t"
517
        /* result[6], result[7] = L[0] * R[6]. */
518        "umull r9, r10, r3, r6 \n\t"
519        "stmia r0!, {r9, r10} \n\t"
520
        /* Pass 2. r0 -> result[3], r2 -> right[3];
           r6,r7,r8 = R[3],R[4],R[5]; r4,r5 = L[1],L[2] (r3 still holds L[0]). */
521        "sub r0, 20 \n\t"
522        "sub r2, 16 \n\t"
523        "ldmia r2!, {r6, r7, r8} \n\t"
524        "ldmia r1!, {r4, r5} \n\t"
525
        /* result[3] = low word of L[0] * R[3]. */
526        "umull r9, r10, r3, r6 \n\t"
527        "stmia r0!, {r9} \n\t"
528
        /* result[4]: L[0] * R[4] + L[1] * R[3]. */
529        "mov r14, #0 \n\t"
530        "umull r9, r12, r3, r7 \n\t"
531        "adds r10, r9 \n\t"
532        "adc r12, #0 \n\t"
533        "umull r9, r11, r4, r6 \n\t"
534        "adds r10, r9 \n\t"
535        "adcs r12, r11 \n\t"
536        "adc r14, #0 \n\t"
537        "stmia r0!, {r10} \n\t"
538
        /* result[5]: L[0] * R[5] + L[1] * R[4] + L[2] * R[3]. */
539        "mov r9, #0 \n\t"
540        "umull r10, r11, r3, r8 \n\t"
541        "adds r12, r10 \n\t"
542        "adcs r14, r11 \n\t"
543        "adc r9, #0 \n\t"
544        "umull r10, r11, r4, r7 \n\t"
545        "adds r12, r10 \n\t"
546        "adcs r14, r11 \n\t"
547        "adc r9, #0 \n\t"
548        "umull r10, r11, r5, r6 \n\t"
549        "adds r12, r10 \n\t"
550        "adcs r14, r11 \n\t"
551        "adc r9, #0 \n\t"
552        "stmia r0!, {r12} \n\t"
553
        /* result[6]: r3 is reloaded with L[3]; L[1] * R[5] + L[2] * R[4] + L[3] * R[3]
           + the word pass 1 stored at result[6]. */
554        "ldmia r1!, {r3} \n\t"
555        "mov r10, #0 \n\t"
556        "umull r11, r12, r4, r8 \n\t"
557        "adds r14, r11 \n\t"
558        "adcs r9, r12 \n\t"
559        "adc r10, #0 \n\t"
560        "umull r11, r12, r5, r7 \n\t"
561        "adds r14, r11 \n\t"
562        "adcs r9, r12 \n\t"
563        "adc r10, #0 \n\t"
564        "umull r11, r12, r3, r6 \n\t"
565        "adds r14, r11 \n\t"
566        "adcs r9, r12 \n\t"
567        "adc r10, #0 \n\t"
568        "ldr r11, [r0] \n\t"
569        "adds r14, r11 \n\t"
570        "adcs r9, #0 \n\t"
571        "adc r10, #0 \n\t"
572        "stmia r0!, {r14} \n\t"
573
        /* result[7]: r6 is reloaded with R[6]; L[1] * R[6] + L[2] * R[5] + L[3] * R[4]
           + the word pass 1 stored at result[7]. */
574        "ldmia r2!, {r6} \n\t"
575        "mov r11, #0 \n\t"
576        "umull r12, r14, r4, r6 \n\t"
577        "adds r9, r12 \n\t"
578        "adcs r10, r14 \n\t"
579        "adc r11, #0 \n\t"
580        "umull r12, r14, r5, r8 \n\t"
581        "adds r9, r12 \n\t"
582        "adcs r10, r14 \n\t"
583        "adc r11, #0 \n\t"
584        "umull r12, r14, r3, r7 \n\t"
585        "adds r9, r12 \n\t"
586        "adcs r10, r14 \n\t"
587        "adc r11, #0 \n\t"
588        "ldr r12, [r0] \n\t"
589        "adds r9, r12 \n\t"
590        "adcs r10, #0 \n\t"
591        "adc r11, #0 \n\t"
592        "stmia r0!, {r9} \n\t"
593
        /* result[8]: L[2] * R[6] + L[3] * R[5]. */
594        "mov r12, #0 \n\t"
595        "umull r14, r9, r5, r6 \n\t"
596        "adds r10, r14 \n\t"
597        "adcs r11, r9 \n\t"
598        "adc r12, #0 \n\t"
599        "umull r14, r9, r3, r8 \n\t"
600        "adds r10, r14 \n\t"
601        "adcs r11, r9 \n\t"
602        "adc r12, #0 \n\t"
603        "stmia r0!, {r10} \n\t"
604
        /* result[9], result[10]: L[3] * R[6] + carried words. */
605        "umull r9, r10, r3, r6 \n\t"
606        "adds r11, r9 \n\t"
607        "adc r12, r10 \n\t"
608        "stmia r0!, {r11, r12} \n\t"
609
        /* Pass 3. Rewind r0 to result[0], r1 to left[0] and r2 to right[0];
           r3,r4,r5 = L[0],L[1],L[2]; r6,r7,r8 = R[0],R[1],R[2]. */
610        "sub r0, 44 \n\t"
611        "sub r1, 16 \n\t"
612        "sub r2, 28 \n\t"
613        "ldmia r1!, {r3,r4,r5} \n\t"
614        "ldmia r2!, {r6,r7,r8} \n\t"
615
        /* result[0] = low word of L[0] * R[0]. */
616        "umull r9, r10, r3, r6 \n\t"
617        "stmia r0!, {r9} \n\t"
618
        /* result[1]: L[0] * R[1] + L[1] * R[0]. */
619        "mov r14, #0 \n\t"
620        "umull r9, r12, r3, r7 \n\t"
621        "adds r10, r9 \n\t"
622        "adc r12, #0 \n\t"
623        "umull r9, r11, r4, r6 \n\t"
624        "adds r10, r9 \n\t"
625        "adcs r12, r11 \n\t"
626        "adc r14, #0 \n\t"
627        "stmia r0!, {r10} \n\t"
628
        /* result[2]: L[0] * R[2] + L[1] * R[1] + L[2] * R[0]. */
629        "mov r9, #0 \n\t"
630        "umull r10, r11, r3, r8 \n\t"
631        "adds r12, r10 \n\t"
632        "adcs r14, r11 \n\t"
633        "adc r9, #0 \n\t"
634        "umull r10, r11, r4, r7 \n\t"
635        "adds r12, r10 \n\t"
636        "adcs r14, r11 \n\t"
637        "adc r9, #0 \n\t"
638        "umull r10, r11, r5, r6 \n\t"
639        "adds r12, r10 \n\t"
640        "adcs r14, r11 \n\t"
641        "adc r9, #0 \n\t"
642        "stmia r0!, {r12} \n\t"
643
        /* result[3]: r3 = L[3]; L[1] * R[2] + L[2] * R[1] + L[3] * R[0] + stored result[3]. */
644        "ldmia r1!, {r3} \n\t"
645        "mov r10, #0 \n\t"
646        "umull r11, r12, r4, r8 \n\t"
647        "adds r14, r11 \n\t"
648        "adcs r9, r12 \n\t"
649        "adc r10, #0 \n\t"
650        "umull r11, r12, r5, r7 \n\t"
651        "adds r14, r11 \n\t"
652        "adcs r9, r12 \n\t"
653        "adc r10, #0 \n\t"
654        "umull r11, r12, r3, r6 \n\t"
655        "adds r14, r11 \n\t"
656        "adcs r9, r12 \n\t"
657        "adc r10, #0 \n\t"
658        "ldr r11, [r0] \n\t"
659        "adds r14, r11 \n\t"
660        "adcs r9, #0 \n\t"
661        "adc r10, #0 \n\t"
662        "stmia r0!, {r14} \n\t"
663
        /* result[4]: r4 = L[4]; L[2] * R[2] + L[3] * R[1] + L[4] * R[0] + stored result[4]. */
664        "ldmia r1!, {r4} \n\t"
665        "mov r11, #0 \n\t"
666        "umull r12, r14, r5, r8 \n\t"
667        "adds r9, r12 \n\t"
668        "adcs r10, r14 \n\t"
669        "adc r11, #0 \n\t"
670        "umull r12, r14, r3, r7 \n\t"
671        "adds r9, r12 \n\t"
672        "adcs r10, r14 \n\t"
673        "adc r11, #0 \n\t"
674        "umull r12, r14, r4, r6 \n\t"
675        "adds r9, r12 \n\t"
676        "adcs r10, r14 \n\t"
677        "adc r11, #0 \n\t"
678        "ldr r12, [r0] \n\t"
679        "adds r9, r12 \n\t"
680        "adcs r10, #0 \n\t"
681        "adc r11, #0 \n\t"
682        "stmia r0!, {r9} \n\t"
683
        /* result[5]: r5 = L[5]; L[3] * R[2] + L[4] * R[1] + L[5] * R[0] + stored result[5]. */
684        "ldmia r1!, {r5} \n\t"
685        "mov r12, #0 \n\t"
686        "umull r14, r9, r3, r8 \n\t"
687        "adds r10, r14 \n\t"
688        "adcs r11, r9 \n\t"
689        "adc r12, #0 \n\t"
690        "umull r14, r9, r4, r7 \n\t"
691        "adds r10, r14 \n\t"
692        "adcs r11, r9 \n\t"
693        "adc r12, #0 \n\t"
694        "umull r14, r9, r5, r6 \n\t"
695        "adds r10, r14 \n\t"
696        "adcs r11, r9 \n\t"
697        "adc r12, #0 \n\t"
698        "ldr r14, [r0] \n\t"
699        "adds r10, r14 \n\t"
700        "adcs r11, #0 \n\t"
701        "adc r12, #0 \n\t"
702        "stmia r0!, {r10} \n\t"
703
        /* result[6]: r3 = L[6]; L[4] * R[2] + L[5] * R[1] + L[6] * R[0] + stored result[6]. */
704        "ldmia r1!, {r3} \n\t"
705        "mov r14, #0 \n\t"
706        "umull r9, r10, r4, r8 \n\t"
707        "adds r11, r9 \n\t"
708        "adcs r12, r10 \n\t"
709        "adc r14, #0 \n\t"
710        "umull r9, r10, r5, r7 \n\t"
711        "adds r11, r9 \n\t"
712        "adcs r12, r10 \n\t"
713        "adc r14, #0 \n\t"
714        "umull r9, r10, r3, r6 \n\t"
715        "adds r11, r9 \n\t"
716        "adcs r12, r10 \n\t"
717        "adc r14, #0 \n\t"
718        "ldr r9, [r0] \n\t"
719        "adds r11, r9 \n\t"
720        "adcs r12, #0 \n\t"
721        "adc r14, #0 \n\t"
722        "stmia r0!, {r11} \n\t"
723
        /* result[7]: r6 = R[3]; L[4] * R[3] + L[5] * R[2] + L[6] * R[1] + stored result[7]. */
724        "ldmia r2!, {r6} \n\t"
725        "mov r9, #0 \n\t"
726        "umull r10, r11, r4, r6 \n\t"
727        "adds r12, r10 \n\t"
728        "adcs r14, r11 \n\t"
729        "adc r9, #0 \n\t"
730        "umull r10, r11, r5, r8 \n\t"
731        "adds r12, r10 \n\t"
732        "adcs r14, r11 \n\t"
733        "adc r9, #0 \n\t"
734        "umull r10, r11, r3, r7 \n\t"
735        "adds r12, r10 \n\t"
736        "adcs r14, r11 \n\t"
737        "adc r9, #0 \n\t"
738        "ldr r10, [r0] \n\t"
739        "adds r12, r10 \n\t"
740        "adcs r14, #0 \n\t"
741        "adc r9, #0 \n\t"
742        "stmia r0!, {r12} \n\t"
743
        /* result[8]: r7 = R[4]; L[4] * R[4] + L[5] * R[3] + L[6] * R[2] + stored result[8]. */
744        "ldmia r2!, {r7} \n\t"
745        "mov r10, #0 \n\t"
746        "umull r11, r12, r4, r7 \n\t"
747        "adds r14, r11 \n\t"
748        "adcs r9, r12 \n\t"
749        "adc r10, #0 \n\t"
750        "umull r11, r12, r5, r6 \n\t"
751        "adds r14, r11 \n\t"
752        "adcs r9, r12 \n\t"
753        "adc r10, #0 \n\t"
754        "umull r11, r12, r3, r8 \n\t"
755        "adds r14, r11 \n\t"
756        "adcs r9, r12 \n\t"
757        "adc r10, #0 \n\t"
758        "ldr r11, [r0] \n\t"
759        "adds r14, r11 \n\t"
760        "adcs r9, #0 \n\t"
761        "adc r10, #0 \n\t"
762        "stmia r0!, {r14} \n\t"
763
        /* result[9]: r8 = R[5]; L[4] * R[5] + L[5] * R[4] + L[6] * R[3] + stored result[9]. */
764        "ldmia r2!, {r8} \n\t"
765        "mov r11, #0 \n\t"
766        "umull r12, r14, r4, r8 \n\t"
767        "adds r9, r12 \n\t"
768        "adcs r10, r14 \n\t"
769        "adc r11, #0 \n\t"
770        "umull r12, r14, r5, r7 \n\t"
771        "adds r9, r12 \n\t"
772        "adcs r10, r14 \n\t"
773        "adc r11, #0 \n\t"
774        "umull r12, r14, r3, r6 \n\t"
775        "adds r9, r12 \n\t"
776        "adcs r10, r14 \n\t"
777        "adc r11, #0 \n\t"
778        "ldr r12, [r0] \n\t"
779        "adds r9, r12 \n\t"
780        "adcs r10, #0 \n\t"
781        "adc r11, #0 \n\t"
782        "stmia r0!, {r9} \n\t"
783
        /* result[10]: r6 = R[6]; L[4] * R[6] + L[5] * R[5] + L[6] * R[4] + stored result[10]. */
784        "ldmia r2!, {r6} \n\t"
785        "mov r12, #0 \n\t"
786        "umull r14, r9, r4, r6 \n\t"
787        "adds r10, r14 \n\t"
788        "adcs r11, r9 \n\t"
789        "adc r12, #0 \n\t"
790        "umull r14, r9, r5, r8 \n\t"
791        "adds r10, r14 \n\t"
792        "adcs r11, r9 \n\t"
793        "adc r12, #0 \n\t"
794        "umull r14, r9, r3, r7 \n\t"
795        "adds r10, r14 \n\t"
796        "adcs r11, r9 \n\t"
797        "adc r12, #0 \n\t"
798        "ldr r14, [r0] \n\t"
799        "adds r10, r14 \n\t"
800        "adcs r11, #0 \n\t"
801        "adc r12, #0 \n\t"
802        "stmia r0!, {r10} \n\t"
803
        /* result[11]: L[5] * R[6] + L[6] * R[5]. */
804        "mov r14, #0 \n\t"
805        "umull r9, r10, r5, r6 \n\t"
806        "adds r11, r9 \n\t"
807        "adcs r12, r10 \n\t"
808        "adc r14, #0 \n\t"
809        "umull r9, r10, r3, r8 \n\t"
810        "adds r11, r9 \n\t"
811        "adcs r12, r10 \n\t"
812        "adc r14, #0 \n\t"
813        "stmia r0!, {r11} \n\t"
814
        /* result[12], result[13]: L[6] * R[6] + carried words. */
815        "umull r10, r11, r3, r6 \n\t"
816        "adds r12, r10 \n\t"
817        "adc r14, r11 \n\t"
818        "stmia r0!, {r12, r14} \n\t"
        /* Switch the assembler back to divided syntax on every platform except thumb2. */
819    #if (uECC_PLATFORM != uECC_arm_thumb2)
820        ".syntax divided \n\t"
821    #endif
        /* r0-r2 are modified by the asm, hence read-write operands. */
822        : "+r" (r0), "+r" (r1), "+r" (r2)
823        :
824        : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
825    );
826}
/* Presumably signals to the including file that an assembly vli_mult is provided - TODO confirm. */
827#define asm_mult 1
828#endif /* (uECC_WORDS == 7) */
829
830#if (uECC_WORDS == 8)
831static void vli_mult(uint32_t *result, const uint32_t *left, const uint32_t *right) {
832    register uint32_t *r0 __asm__("r0") = result;
833    register const uint32_t *r1 __asm__("r1") = left;
834    register const uint32_t *r2 __asm__("r2") = right;
835
836    __asm__ volatile (
837        ".syntax unified \n\t"
838        "add r0, 24 \n\t"
839        "add r2, 24 \n\t"
840        "ldmia r1!, {r3,r4} \n\t"
841        "ldmia r2!, {r6,r7} \n\t"
842
843        "umull r11, r12, r3, r6 \n\t"
844        "stmia r0!, {r11} \n\t"
845
846        "mov r10, #0 \n\t"
847        "umull r11, r9, r3, r7 \n\t"
848        "adds r12, r11 \n\t"
849        "adc r9, #0 \n\t"
850        "umull r11, r14, r4, r6 \n\t"
851        "adds r12, r11 \n\t"
852        "adcs r9, r14 \n\t"
853        "adc r10, #0 \n\t"
854        "stmia r0!, {r12} \n\t"
855
856        "umull r12, r14, r4, r7 \n\t"
857        "adds r9, r12 \n\t"
858        "adc r10, r14 \n\t"
859        "stmia r0!, {r9, r10} \n\t"
860
861        "sub r0, 28 \n\t"
862        "sub r2, 20 \n\t"
863        "ldmia r2!, {r6,r7,r8} \n\t"
864        "ldmia r1!, {r5} \n\t"
865
866        "umull r11, r12, r3, r6 \n\t"
867        "stmia r0!, {r11} \n\t"
868
869        "mov r10, #0 \n\t"
870        "umull r11, r9, r3, r7 \n\t"
871        "adds r12, r11 \n\t"
872        "adc r9, #0 \n\t"
873        "umull r11, r14, r4, r6 \n\t"
874        "adds r12, r11 \n\t"
875        "adcs r9, r14 \n\t"
876        "adc r10, #0 \n\t"
877        "stmia r0!, {r12} \n\t"
878
879        "mov r11, #0 \n\t"
880        "umull r12, r14, r3, r8 \n\t"
881        "adds r9, r12 \n\t"
882        "adcs r10, r14 \n\t"
883        "adc r11, #0 \n\t"
884        "umull r12, r14, r4, r7 \n\t"
885        "adds r9, r12 \n\t"
886        "adcs r10, r14 \n\t"
887        "adc r11, #0 \n\t"
888        "umull r12, r14, r5, r6 \n\t"
889        "adds r9, r12 \n\t"
890        "adcs r10, r14 \n\t"
891        "adc r11, #0 \n\t"
892        "stmia r0!, {r9} \n\t"
893
894        "ldmia r1!, {r3} \n\t"
895        "mov r12, #0 \n\t"
896        "umull r14, r9, r4, r8 \n\t"
897        "adds r10, r14 \n\t"
898        "adcs r11, r9 \n\t"
899        "adc r12, #0 \n\t"
900        "umull r14, r9, r5, r7 \n\t"
901        "adds r10, r14 \n\t"
902        "adcs r11, r9 \n\t"
903        "adc r12, #0 \n\t"
904        "umull r14, r9, r3, r6 \n\t"
905        "adds r10, r14 \n\t"
906        "adcs r11, r9 \n\t"
907        "adc r12, #0 \n\t"
908        "ldr r14, [r0] \n\t"
909        "adds r10, r14 \n\t"
910        "adcs r11, #0 \n\t"
911        "adc r12, #0 \n\t"
912        "stmia r0!, {r10} \n\t"
913
914        "ldmia r1!, {r4} \n\t"
915        "mov r14, #0 \n\t"
916        "umull r9, r10, r5, r8 \n\t"
917        "adds r11, r9 \n\t"
918        "adcs r12, r10 \n\t"
919        "adc r14, #0 \n\t"
920        "umull r9, r10, r3, r7 \n\t"
921        "adds r11, r9 \n\t"
922        "adcs r12, r10 \n\t"
923        "adc r14, #0 \n\t"
924        "umull r9, r10, r4, r6 \n\t"
925        "adds r11, r9 \n\t"
926        "adcs r12, r10 \n\t"
927        "adc r14, #0 \n\t"
928        "ldr r9, [r0] \n\t"
929        "adds r11, r9 \n\t"
930        "adcs r12, #0 \n\t"
931        "adc r14, #0 \n\t"
932        "stmia r0!, {r11} \n\t"
933
934        "ldmia r2!, {r6} \n\t"
935        "mov r9, #0 \n\t"
936        "umull r10, r11, r5, r6 \n\t"
937        "adds r12, r10 \n\t"
938        "adcs r14, r11 \n\t"
939        "adc r9, #0 \n\t"
940        "umull r10, r11, r3, r8 \n\t"
941        "adds r12, r10 \n\t"
942        "adcs r14, r11 \n\t"
943        "adc r9, #0 \n\t"
944        "umull r10, r11, r4, r7 \n\t"
945        "adds r12, r10 \n\t"
946        "adcs r14, r11 \n\t"
947        "adc r9, #0 \n\t"
948        "ldr r10, [r0] \n\t"
949        "adds r12, r10 \n\t"
950        "adcs r14, #0 \n\t"
951        "adc r9, #0 \n\t"
952        "stmia r0!, {r12} \n\t"
953
954        "ldmia r2!, {r7} \n\t"
955        "mov r10, #0 \n\t"
956        "umull r11, r12, r5, r7 \n\t"
957        "adds r14, r11 \n\t"
958        "adcs r9, r12 \n\t"
959        "adc r10, #0 \n\t"
960        "umull r11, r12, r3, r6 \n\t"
961        "adds r14, r11 \n\t"
962        "adcs r9, r12 \n\t"
963        "adc r10, #0 \n\t"
964        "umull r11, r12, r4, r8 \n\t"
965        "adds r14, r11 \n\t"
966        "adcs r9, r12 \n\t"
967        "adc r10, #0 \n\t"
968        "ldr r11, [r0] \n\t"
969        "adds r14, r11 \n\t"
970        "adcs r9, #0 \n\t"
971        "adc r10, #0 \n\t"
972        "stmia r0!, {r14} \n\t"
973
974        "mov r11, #0 \n\t"
975        "umull r12, r14, r3, r7 \n\t"
976        "adds r9, r12 \n\t"
977        "adcs r10, r14 \n\t"
978        "adc r11, #0 \n\t"
979        "umull r12, r14, r4, r6 \n\t"
980        "adds r9, r12 \n\t"
981        "adcs r10, r14 \n\t"
982        "adc r11, #0 \n\t"
983        "stmia r0!, {r9} \n\t"
984
985        "umull r14, r9, r4, r7 \n\t"
986        "adds r10, r14 \n\t"
987        "adc r11, r9 \n\t"
988        "stmia r0!, {r10, r11} \n\t"
989
990        "sub r0, 52 \n\t"
991        "sub r1, 20 \n\t"
992        "sub r2, 32 \n\t"
993        "ldmia r1!, {r3,r4,r5} \n\t"
994        "ldmia r2!, {r6,r7,r8} \n\t"
995
996        "umull r11, r12, r3, r6 \n\t"
997        "stmia r0!, {r11} \n\t"
998
999        "mov r10, #0 \n\t"
1000        "umull r11, r9, r3, r7 \n\t"
1001        "adds r12, r11 \n\t"
1002        "adc r9, #0 \n\t"
1003        "umull r11, r14, r4, r6 \n\t"
1004        "adds r12, r11 \n\t"
1005        "adcs r9, r14 \n\t"
1006        "adc r10, #0 \n\t"
1007        "stmia r0!, {r12} \n\t"
1008
1009        "mov r11, #0 \n\t"
1010        "umull r12, r14, r3, r8 \n\t"
1011        "adds r9, r12 \n\t"
1012        "adcs r10, r14 \n\t"
1013        "adc r11, #0 \n\t"
1014        "umull r12, r14, r4, r7 \n\t"
1015        "adds r9, r12 \n\t"
1016        "adcs r10, r14 \n\t"
1017        "adc r11, #0 \n\t"
1018        "umull r12, r14, r5, r6 \n\t"
1019        "adds r9, r12 \n\t"
1020        "adcs r10, r14 \n\t"
1021        "adc r11, #0 \n\t"
1022        "stmia r0!, {r9} \n\t"
1023
1024        "ldmia r1!, {r3} \n\t"
1025        "mov r12, #0 \n\t"
1026        "umull r14, r9, r4, r8 \n\t"
1027        "adds r10, r14 \n\t"
1028        "adcs r11, r9 \n\t"
1029        "adc r12, #0 \n\t"
1030        "umull r14, r9, r5, r7 \n\t"
1031        "adds r10, r14 \n\t"
1032        "adcs r11, r9 \n\t"
1033        "adc r12, #0 \n\t"
1034        "umull r14, r9, r3, r6 \n\t"
1035        "adds r10, r14 \n\t"
1036        "adcs r11, r9 \n\t"
1037        "adc r12, #0 \n\t"
1038        "ldr r14, [r0] \n\t"
1039        "adds r10, r14 \n\t"
1040        "adcs r11, #0 \n\t"
1041        "adc r12, #0 \n\t"
1042        "stmia r0!, {r10} \n\t"
1043
1044        "ldmia r1!, {r4} \n\t"
1045        "mov r14, #0 \n\t"
1046        "umull r9, r10, r5, r8 \n\t"
1047        "adds r11, r9 \n\t"
1048        "adcs r12, r10 \n\t"
1049        "adc r14, #0 \n\t"
1050        "umull r9, r10, r3, r7 \n\t"
1051        "adds r11, r9 \n\t"
1052        "adcs r12, r10 \n\t"
1053        "adc r14, #0 \n\t"
1054        "umull r9, r10, r4, r6 \n\t"
1055        "adds r11, r9 \n\t"
1056        "adcs r12, r10 \n\t"
1057        "adc r14, #0 \n\t"
1058        "ldr r9, [r0] \n\t"
1059        "adds r11, r9 \n\t"
1060        "adcs r12, #0 \n\t"
1061        "adc r14, #0 \n\t"
1062        "stmia r0!, {r11} \n\t"
1063
1064        "ldmia r1!, {r5} \n\t"
1065        "mov r9, #0 \n\t"
1066        "umull r10, r11, r3, r8 \n\t"
1067        "adds r12, r10 \n\t"
1068        "adcs r14, r11 \n\t"
1069        "adc r9, #0 \n\t"
1070        "umull r10, r11, r4, r7 \n\t"
1071        "adds r12, r10 \n\t"
1072        "adcs r14, r11 \n\t"
1073        "adc r9, #0 \n\t"
1074        "umull r10, r11, r5, r6 \n\t"
1075        "adds r12, r10 \n\t"
1076        "adcs r14, r11 \n\t"
1077        "adc r9, #0 \n\t"
1078        "ldr r10, [r0] \n\t"
1079        "adds r12, r10 \n\t"
1080        "adcs r14, #0 \n\t"
1081        "adc r9, #0 \n\t"
1082        "stmia r0!, {r12} \n\t"
1083
1084        "ldmia r1!, {r3} \n\t"
1085        "mov r10, #0 \n\t"
1086        "umull r11, r12, r4, r8 \n\t"
1087        "adds r14, r11 \n\t"
1088        "adcs r9, r12 \n\t"
1089        "adc r10, #0 \n\t"
1090        "umull r11, r12, r5, r7 \n\t"
1091        "adds r14, r11 \n\t"
1092        "adcs r9, r12 \n\t"
1093        "adc r10, #0 \n\t"
1094        "umull r11, r12, r3, r6 \n\t"
1095        "adds r14, r11 \n\t"
1096        "adcs r9, r12 \n\t"
1097        "adc r10, #0 \n\t"
1098        "ldr r11, [r0] \n\t"
1099        "adds r14, r11 \n\t"
1100        "adcs r9, #0 \n\t"
1101        "adc r10, #0 \n\t"
1102        "stmia r0!, {r14} \n\t"
1103
1104        "ldmia r1!, {r4} \n\t"
1105        "mov r11, #0 \n\t"
1106        "umull r12, r14, r5, r8 \n\t"
1107        "adds r9, r12 \n\t"
1108        "adcs r10, r14 \n\t"
1109        "adc r11, #0 \n\t"
1110        "umull r12, r14, r3, r7 \n\t"
1111        "adds r9, r12 \n\t"
1112        "adcs r10, r14 \n\t"
1113        "adc r11, #0 \n\t"
1114        "umull r12, r14, r4, r6 \n\t"
1115        "adds r9, r12 \n\t"
1116        "adcs r10, r14 \n\t"
1117        "adc r11, #0 \n\t"
1118        "ldr r12, [r0] \n\t"
1119        "adds r9, r12 \n\t"
1120        "adcs r10, #0 \n\t"
1121        "adc r11, #0 \n\t"
1122        "stmia r0!, {r9} \n\t"
1123
1124        "ldmia r2!, {r6} \n\t"
1125        "mov r12, #0 \n\t"
1126        "umull r14, r9, r5, r6 \n\t"
1127        "adds r10, r14 \n\t"
1128        "adcs r11, r9 \n\t"
1129        "adc r12, #0 \n\t"
1130        "umull r14, r9, r3, r8 \n\t"
1131        "adds r10, r14 \n\t"
1132        "adcs r11, r9 \n\t"
1133        "adc r12, #0 \n\t"
1134        "umull r14, r9, r4, r7 \n\t"
1135        "adds r10, r14 \n\t"
1136        "adcs r11, r9 \n\t"
1137        "adc r12, #0 \n\t"
1138        "ldr r14, [r0] \n\t"
1139        "adds r10, r14 \n\t"
1140        "adcs r11, #0 \n\t"
1141        "adc r12, #0 \n\t"
1142        "stmia r0!, {r10} \n\t"
1143
1144        "ldmia r2!, {r7} \n\t"
1145        "mov r14, #0 \n\t"
1146        "umull r9, r10, r5, r7 \n\t"
1147        "adds r11, r9 \n\t"
1148        "adcs r12, r10 \n\t"
1149        "adc r14, #0 \n\t"
1150        "umull r9, r10, r3, r6 \n\t"
1151        "adds r11, r9 \n\t"
1152        "adcs r12, r10 \n\t"
1153        "adc r14, #0 \n\t"
1154        "umull r9, r10, r4, r8 \n\t"
1155        "adds r11, r9 \n\t"
1156        "adcs r12, r10 \n\t"
1157        "adc r14, #0 \n\t"
1158        "ldr r9, [r0] \n\t"
1159        "adds r11, r9 \n\t"
1160        "adcs r12, #0 \n\t"
1161        "adc r14, #0 \n\t"
1162        "stmia r0!, {r11} \n\t"
1163
1164        "ldmia r2!, {r8} \n\t"
1165        "mov r9, #0 \n\t"
1166        "umull r10, r11, r5, r8 \n\t"
1167        "adds r12, r10 \n\t"
1168        "adcs r14, r11 \n\t"
1169        "adc r9, #0 \n\t"
1170        "umull r10, r11, r3, r7 \n\t"
1171        "adds r12, r10 \n\t"
1172        "adcs r14, r11 \n\t"
1173        "adc r9, #0 \n\t"
1174        "umull r10, r11, r4, r6 \n\t"
1175        "adds r12, r10 \n\t"
1176        "adcs r14, r11 \n\t"
1177        "adc r9, #0 \n\t"
1178        "ldr r10, [r0] \n\t"
1179        "adds r12, r10 \n\t"
1180        "adcs r14, #0 \n\t"
1181        "adc r9, #0 \n\t"
1182        "stmia r0!, {r12} \n\t"
1183
1184        "ldmia r2!, {r6} \n\t"
1185        "mov r10, #0 \n\t"
1186        "umull r11, r12, r5, r6 \n\t"
1187        "adds r14, r11 \n\t"
1188        "adcs r9, r12 \n\t"
1189        "adc r10, #0 \n\t"
1190        "umull r11, r12, r3, r8 \n\t"
1191        "adds r14, r11 \n\t"
1192        "adcs r9, r12 \n\t"
1193        "adc r10, #0 \n\t"
1194        "umull r11, r12, r4, r7 \n\t"
1195        "adds r14, r11 \n\t"
1196        "adcs r9, r12 \n\t"
1197        "adc r10, #0 \n\t"
1198        "ldr r11, [r0] \n\t"
1199        "adds r14, r11 \n\t"
1200        "adcs r9, #0 \n\t"
1201        "adc r10, #0 \n\t"
1202        "stmia r0!, {r14} \n\t"
1203
1204        "ldmia r2!, {r7} \n\t"
1205        "mov r11, #0 \n\t"
1206        "umull r12, r14, r5, r7 \n\t"
1207        "adds r9, r12 \n\t"
1208        "adcs r10, r14 \n\t"
1209        "adc r11, #0 \n\t"
1210        "umull r12, r14, r3, r6 \n\t"
1211        "adds r9, r12 \n\t"
1212        "adcs r10, r14 \n\t"
1213        "adc r11, #0 \n\t"
1214        "umull r12, r14, r4, r8 \n\t"
1215        "adds r9, r12 \n\t"
1216        "adcs r10, r14 \n\t"
1217        "adc r11, #0 \n\t"
1218        "ldr r12, [r0] \n\t"
1219        "adds r9, r12 \n\t"
1220        "adcs r10, #0 \n\t"
1221        "adc r11, #0 \n\t"
1222        "stmia r0!, {r9} \n\t"
1223
1224        "mov r12, #0 \n\t"
1225        "umull r14, r9, r3, r7 \n\t"
1226        "adds r10, r14 \n\t"
1227        "adcs r11, r9 \n\t"
1228        "adc r12, #0 \n\t"
1229        "umull r14, r9, r4, r6 \n\t"
1230        "adds r10, r14 \n\t"
1231        "adcs r11, r9 \n\t"
1232        "adc r12, #0 \n\t"
1233        "stmia r0!, {r10} \n\t"
1234
1235        "umull r9, r10, r4, r7 \n\t"
1236        "adds r11, r9 \n\t"
1237        "adc r12, r10 \n\t"
1238        "stmia r0!, {r11, r12} \n\t"
1239    #if (uECC_PLATFORM != uECC_arm_thumb2)
1240        ".syntax divided \n\t"
1241    #endif
1242        : "+r" (r0), "+r" (r1), "+r" (r2)
1243        :
1244        : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
1245    );
1246}
1247#define asm_mult 1
1248#endif /* (uECC_WORDS == 8) */
1249
1250#if uECC_SQUARE_FUNC
1251#if (uECC_WORDS == 5)
/* vli_square, 5-word variant (compiled when uECC_WORDS == 5).
 *
 * Computes result = left * left, where `left` is read as five 32-bit words
 * and `result` is written as ten 32-bit words, least-significant word first.
 *
 * Notation used below: a[i] is the word left[i]. After the initial load,
 * r2..r6 hold a[0]..a[4].
 *
 * The output is produced one word ("column") at a time. Column k is the sum
 * of every product a[i]*a[j] with i + j == k; products with i != j are formed
 * once and doubled. Each column keeps a three-register accumulator (low word,
 * high word, overflow). The low word is stored through r0 and the upper two
 * registers become the starting value of the next column, so the registers
 * playing each role rotate among r8-r12.
 *
 * r1 is only needed as a pointer for the initial load; afterwards it is
 * reused as a scratch register (it is declared as an in/out operand).
 */
1252static void vli_square(uint32_t *result, const uint32_t *left) {
1253    register uint32_t *r0 __asm__("r0") = result;
1254    register const uint32_t *r1 __asm__("r1") = left;
1255
1256    __asm__ volatile (
1257        ".syntax unified \n\t"
        /* r2..r6 = a[0]..a[4] */
1258        "ldmia r1!, {r2,r3,r4,r5,r6} \n\t"
1259
        /* Column 0: a[0]*a[0]. Low word is stored, r12 carries the high word. */
1260        "umull r11, r12, r2, r2 \n\t"
1261        "stmia r0!, {r11} \n\t"
1262
        /* Column 1: 2*a[0]*a[1], done by adding the same product twice.
           The first adcs also initialises r8 (r8 = high word + carry). */
1263        "mov r9, #0 \n\t"
1264        "umull r10, r11, r2, r3 \n\t"
1265        "adds r12, r10 \n\t"
1266        "adcs r8, r11, #0 \n\t"
1267        "adc r9, #0 \n\t"
1268        "adds r12, r10 \n\t"
1269        "adcs r8, r11 \n\t"
1270        "adc r9, #0 \n\t"
1271        "stmia r0!, {r12} \n\t"
1272
        /* Column 2: 2*a[0]*a[2] + a[1]*a[1]. The cross product is doubled in
           r11:r12 (carry out into r10) before being added to the accumulator. */
1273        "mov r10, #0 \n\t"
1274        "umull r11, r12, r2, r4 \n\t"
1275        "adds r11, r11 \n\t"
1276        "adcs r12, r12 \n\t"
1277        "adc r10, #0 \n\t"
1278        "adds r8, r11 \n\t"
1279        "adcs r9, r12 \n\t"
1280        "adc r10, #0 \n\t"
1281        "umull r11, r12, r3, r3 \n\t"
1282        "adds r8, r11 \n\t"
1283        "adcs r9, r12 \n\t"
1284        "adc r10, #0 \n\t"
1285        "stmia r0!, {r8} \n\t"
1286
        /* Column 3: 2*(a[0]*a[3] + a[1]*a[2]). The cross products are summed
           into r8:r11:r12, doubled, then the carry-in (r9, r10) is added.
           r1 and r14 are scratch from here on. */
1287        "mov r12, #0 \n\t"
1288        "umull r8, r11, r2, r5 \n\t"
1289        "umull r1, r14, r3, r4 \n\t"
1290        "adds r8, r1 \n\t"
1291        "adcs r11, r14 \n\t"
1292        "adc r12, #0 \n\t"
1293        "adds r8, r8 \n\t"
1294        "adcs r11, r11 \n\t"
1295        "adc r12, r12 \n\t"
1296        "adds r8, r9 \n\t"
1297        "adcs r11, r10 \n\t"
1298        "adc r12, #0 \n\t"
1299        "stmia r0!, {r8} \n\t"
1300
        /* Column 4: 2*(a[0]*a[4] + a[1]*a[3]) + a[2]*a[2], plus carry-in (r11, r12). */
1301        "mov r10, #0 \n\t"
1302        "umull r8, r9, r2, r6 \n\t"
1303        "umull r1, r14, r3, r5 \n\t"
1304        "adds r8, r1 \n\t"
1305        "adcs r9, r14 \n\t"
1306        "adc r10, #0 \n\t"
1307        "adds r8, r8 \n\t"
1308        "adcs r9, r9 \n\t"
1309        "adc r10, r10 \n\t"
1310        "umull r1, r14, r4, r4 \n\t"
1311        "adds r8, r1 \n\t"
1312        "adcs r9, r14 \n\t"
1313        "adc r10, #0 \n\t"
1314        "adds r8, r11 \n\t"
1315        "adcs r9, r12 \n\t"
1316        "adc r10, #0 \n\t"
1317        "stmia r0!, {r8} \n\t"
1318
        /* Column 5: 2*(a[1]*a[4] + a[2]*a[3]), plus carry-in (r9, r10). */
1319        "mov r12, #0 \n\t"
1320        "umull r8, r11, r3, r6 \n\t"
1321        "umull r1, r14, r4, r5 \n\t"
1322        "adds r8, r1 \n\t"
1323        "adcs r11, r14 \n\t"
1324        "adc r12, #0 \n\t"
1325        "adds r8, r8 \n\t"
1326        "adcs r11, r11 \n\t"
1327        "adc r12, r12 \n\t"
1328        "adds r8, r9 \n\t"
1329        "adcs r11, r10 \n\t"
1330        "adc r12, #0 \n\t"
1331        "stmia r0!, {r8} \n\t"
1332
        /* Column 6: 2*a[2]*a[4] + a[3]*a[3], accumulated directly onto the
           carry-in held in r11, r12 (overflow collected in r8). */
1333        "mov r8, #0 \n\t"
1334        "umull r1, r10, r4, r6 \n\t"
1335        "adds r1, r1 \n\t"
1336        "adcs r10, r10 \n\t"
1337        "adc r8, #0 \n\t"
1338        "adds r11, r1 \n\t"
1339        "adcs r12, r10 \n\t"
1340        "adc r8, #0 \n\t"
1341        "umull r1, r10, r5, r5 \n\t"
1342        "adds r11, r1 \n\t"
1343        "adcs r12, r10 \n\t"
1344        "adc r8, #0 \n\t"
1345        "stmia r0!, {r11} \n\t"
1346
        /* Column 7: 2*a[3]*a[4], added onto the carry-in held in r12, r8. */
1347        "mov r11, #0 \n\t"
1348        "umull r1, r10, r5, r6 \n\t"
1349        "adds r1, r1 \n\t"
1350        "adcs r10, r10 \n\t"
1351        "adc r11, #0 \n\t"
1352        "adds r12, r1 \n\t"
1353        "adcs r8, r10 \n\t"
1354        "adc r11, #0 \n\t"
1355        "stmia r0!, {r12} \n\t"
1356
        /* Columns 8 and 9: a[4]*a[4] plus carry-in; both remaining words are stored. */
1357        "umull r1, r10, r6, r6 \n\t"
1358        "adds r8, r1 \n\t"
1359        "adcs r11, r10 \n\t"
1360        "stmia r0!, {r8, r11} \n\t"
        /* On platforms other than Thumb-2, switch the assembler back to divided syntax. */
1361    #if (uECC_PLATFORM != uECC_arm_thumb2)
1362        ".syntax divided \n\t"
1363    #endif
        /* r0 and r1 are read and modified by the asm; no input-only operands. */
1364        : "+r" (r0), "+r" (r1)
1365        :
        /* r7 is listed as clobbered although this variant does not use it. */
1366        : "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
1367    );
1368}
1369#define asm_square 1
1370#endif /* (uECC_WORDS == 5) */
1371
1372#if (uECC_WORDS == 6)
/* vli_square, 6-word variant (compiled when uECC_WORDS == 6).
 *
 * Computes result = left * left, where `left` is read as six 32-bit words
 * and `result` is written as twelve 32-bit words, least-significant word
 * first.
 *
 * Notation used below: a[i] is the word left[i]. After the initial load,
 * r2..r7 hold a[0]..a[5].
 *
 * The output is produced one word ("column") at a time. Column k is the sum
 * of every product a[i]*a[j] with i + j == k; products with i != j are formed
 * once and doubled. Each column keeps a three-register accumulator (low word,
 * high word, overflow). The low word is stored through r0 and the upper two
 * registers become the starting value of the next column, so the registers
 * playing each role rotate among r8-r12.
 *
 * r1 is only needed as a pointer for the initial load; afterwards it is
 * reused as a scratch register (it is declared as an in/out operand).
 */
1373static void vli_square(uint32_t *result, const uint32_t *left) {
1374    register uint32_t *r0 __asm__("r0") = result;
1375    register const uint32_t *r1 __asm__("r1") = left;
1376
1377    __asm__ volatile (
1378        ".syntax unified \n\t"
        /* r2..r7 = a[0]..a[5] */
1379        "ldmia r1!, {r2,r3,r4,r5,r6,r7} \n\t"
1380
        /* Column 0: a[0]*a[0]. Low word is stored, r12 carries the high word. */
1381        "umull r11, r12, r2, r2 \n\t"
1382        "stmia r0!, {r11} \n\t"
1383
        /* Column 1: 2*a[0]*a[1], done by adding the same product twice.
           The first adcs also initialises r8 (r8 = high word + carry). */
1384        "mov r9, #0 \n\t"
1385        "umull r10, r11, r2, r3 \n\t"
1386        "adds r12, r10 \n\t"
1387        "adcs r8, r11, #0 \n\t"
1388        "adc r9, #0 \n\t"
1389        "adds r12, r10 \n\t"
1390        "adcs r8, r11 \n\t"
1391        "adc r9, #0 \n\t"
1392        "stmia r0!, {r12} \n\t"
1393
        /* Column 2: 2*a[0]*a[2] + a[1]*a[1]. The cross product is doubled in
           r11:r12 (carry out into r10) before being added to the accumulator. */
1394        "mov r10, #0 \n\t"
1395        "umull r11, r12, r2, r4 \n\t"
1396        "adds r11, r11 \n\t"
1397        "adcs r12, r12 \n\t"
1398        "adc r10, #0 \n\t"
1399        "adds r8, r11 \n\t"
1400        "adcs r9, r12 \n\t"
1401        "adc r10, #0 \n\t"
1402        "umull r11, r12, r3, r3 \n\t"
1403        "adds r8, r11 \n\t"
1404        "adcs r9, r12 \n\t"
1405        "adc r10, #0 \n\t"
1406        "stmia r0!, {r8} \n\t"
1407
        /* Column 3: 2*(a[0]*a[3] + a[1]*a[2]). The cross products are summed
           into r8:r11:r12, doubled, then the carry-in (r9, r10) is added.
           r1 and r14 are scratch from here on. */
1408        "mov r12, #0 \n\t"
1409        "umull r8, r11, r2, r5 \n\t"
1410        "umull r1, r14, r3, r4 \n\t"
1411        "adds r8, r1 \n\t"
1412        "adcs r11, r14 \n\t"
1413        "adc r12, #0 \n\t"
1414        "adds r8, r8 \n\t"
1415        "adcs r11, r11 \n\t"
1416        "adc r12, r12 \n\t"
1417        "adds r8, r9 \n\t"
1418        "adcs r11, r10 \n\t"
1419        "adc r12, #0 \n\t"
1420        "stmia r0!, {r8} \n\t"
1421
        /* Column 4: 2*(a[0]*a[4] + a[1]*a[3]) + a[2]*a[2], plus carry-in (r11, r12). */
1422        "mov r10, #0 \n\t"
1423        "umull r8, r9, r2, r6 \n\t"
1424        "umull r1, r14, r3, r5 \n\t"
1425        "adds r8, r1 \n\t"
1426        "adcs r9, r14 \n\t"
1427        "adc r10, #0 \n\t"
1428        "adds r8, r8 \n\t"
1429        "adcs r9, r9 \n\t"
1430        "adc r10, r10 \n\t"
1431        "umull r1, r14, r4, r4 \n\t"
1432        "adds r8, r1 \n\t"
1433        "adcs r9, r14 \n\t"
1434        "adc r10, #0 \n\t"
1435        "adds r8, r11 \n\t"
1436        "adcs r9, r12 \n\t"
1437        "adc r10, #0 \n\t"
1438        "stmia r0!, {r8} \n\t"
1439
        /* Column 5: 2*(a[0]*a[5] + a[1]*a[4] + a[2]*a[3]), plus carry-in (r9, r10). */
1440        "mov r12, #0 \n\t"
1441        "umull r8, r11, r2, r7 \n\t"
1442        "umull r1, r14, r3, r6 \n\t"
1443        "adds r8, r1 \n\t"
1444        "adcs r11, r14 \n\t"
1445        "adc r12, #0 \n\t"
1446        "umull r1, r14, r4, r5 \n\t"
1447        "adds r8, r1 \n\t"
1448        "adcs r11, r14 \n\t"
1449        "adc r12, #0 \n\t"
1450        "adds r8, r8 \n\t"
1451        "adcs r11, r11 \n\t"
1452        "adc r12, r12 \n\t"
1453        "adds r8, r9 \n\t"
1454        "adcs r11, r10 \n\t"
1455        "adc r12, #0 \n\t"
1456        "stmia r0!, {r8} \n\t"
1457
        /* Column 6: 2*(a[1]*a[5] + a[2]*a[4]) + a[3]*a[3], plus carry-in (r11, r12). */
1458        "mov r10, #0 \n\t"
1459        "umull r8, r9, r3, r7 \n\t"
1460        "umull r1, r14, r4, r6 \n\t"
1461        "adds r8, r1 \n\t"
1462        "adcs r9, r14 \n\t"
1463        "adc r10, #0 \n\t"
1464        "adds r8, r8 \n\t"
1465        "adcs r9, r9 \n\t"
1466        "adc r10, r10 \n\t"
1467        "umull r1, r14, r5, r5 \n\t"
1468        "adds r8, r1 \n\t"
1469        "adcs r9, r14 \n\t"
1470        "adc r10, #0 \n\t"
1471        "adds r8, r11 \n\t"
1472        "adcs r9, r12 \n\t"
1473        "adc r10, #0 \n\t"
1474        "stmia r0!, {r8} \n\t"
1475
        /* Column 7: 2*(a[2]*a[5] + a[3]*a[4]), plus carry-in (r9, r10). */
1476        "mov r12, #0 \n\t"
1477        "umull r8, r11, r4, r7 \n\t"
1478        "umull r1, r14, r5, r6 \n\t"
1479        "adds r8, r1 \n\t"
1480        "adcs r11, r14 \n\t"
1481        "adc r12, #0 \n\t"
1482        "adds r8, r8 \n\t"
1483        "adcs r11, r11 \n\t"
1484        "adc r12, r12 \n\t"
1485        "adds r8, r9 \n\t"
1486        "adcs r11, r10 \n\t"
1487        "adc r12, #0 \n\t"
1488        "stmia r0!, {r8} \n\t"
1489
        /* Column 8: 2*a[3]*a[5] + a[4]*a[4], accumulated directly onto the
           carry-in held in r11, r12 (overflow collected in r8). */
1490        "mov r8, #0 \n\t"
1491        "umull r1, r10, r5, r7 \n\t"
1492        "adds r1, r1 \n\t"
1493        "adcs r10, r10 \n\t"
1494        "adc r8, #0 \n\t"
1495        "adds r11, r1 \n\t"
1496        "adcs r12, r10 \n\t"
1497        "adc r8, #0 \n\t"
1498        "umull r1, r10, r6, r6 \n\t"
1499        "adds r11, r1 \n\t"
1500        "adcs r12, r10 \n\t"
1501        "adc r8, #0 \n\t"
1502        "stmia r0!, {r11} \n\t"
1503
        /* Column 9: 2*a[4]*a[5], added onto the carry-in held in r12, r8. */
1504        "mov r11, #0 \n\t"
1505        "umull r1, r10, r6, r7 \n\t"
1506        "adds r1, r1 \n\t"
1507        "adcs r10, r10 \n\t"
1508        "adc r11, #0 \n\t"
1509        "adds r12, r1 \n\t"
1510        "adcs r8, r10 \n\t"
1511        "adc r11, #0 \n\t"
1512        "stmia r0!, {r12} \n\t"
1513
        /* Columns 10 and 11: a[5]*a[5] plus carry-in; both remaining words are stored. */
1514        "umull r1, r10, r7, r7 \n\t"
1515        "adds r8, r1 \n\t"
1516        "adcs r11, r10 \n\t"
1517        "stmia r0!, {r8, r11} \n\t"
        /* On platforms other than Thumb-2, switch the assembler back to divided syntax. */
1518    #if (uECC_PLATFORM != uECC_arm_thumb2)
1519        ".syntax divided \n\t"
1520    #endif
        /* r0 and r1 are read and modified by the asm; no input-only operands. */
1521        : "+r" (r0), "+r" (r1)
1522        :
1523        : "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
1524    );
1525}
1526#define asm_square 1
1527#endif /* (uECC_WORDS == 6) */
1528
1529#if (uECC_WORDS == 7)
/* vli_square, 7-word variant (compiled when uECC_WORDS == 7).
 *
 * Computes result = left * left, where `left` is read as seven 32-bit words
 * and `result` is written as fourteen 32-bit words, least-significant word
 * first.
 *
 * Notation used below: a[i] is the word left[i].
 *
 * Register use: r2..r7 first hold a[0]..a[5]; once a[0] is no longer needed,
 * r2 is reloaded with a[6]. Because a[0] and a[6] are never in registers at
 * the same time in the main sequence, the product a[0]*a[6] is computed up
 * front and parked in result[6] and result[7]; columns 6 and 7 read those
 * words back with `ldr r14, [r0]` before overwriting them.
 *
 * The output is produced one word ("column") at a time. Column k is the sum
 * of every product a[i]*a[j] with i + j == k; products with i != j are formed
 * once and doubled. Each column keeps a three-register accumulator (low word,
 * high word, overflow). The low word is stored through r0 and the upper two
 * registers become the starting value of the next column.
 *
 * From column 3 onwards products are accumulated with umlal. umlal does not
 * report a carry out of its 64-bit accumulator, so the high word is copied to
 * r14 beforehand and compared afterwards: if the old high word is greater
 * than the new one (HI), the accumulator wrapped and 1 is added to the
 * overflow register (C is set whenever HI holds, so `adchi rX, #0` adds 1).
 */
1530static void vli_square(uint32_t *result, const uint32_t *left) {
1531    register uint32_t *r0 __asm__("r0") = result;
1532    register const uint32_t *r1 __asm__("r1") = left;
1533
1534    __asm__ volatile (
1535        ".syntax unified \n\t"
        /* Preamble: r2 = a[0], r5 = a[6]; store a[0]*a[6] into result[6..7],
           then rewind r0 and r1 to the start of result and left. */
1536        "ldmia r1!, {r2} \n\t"
1537        "add r1, 20 \n\t"
1538        "ldmia r1!, {r5} \n\t"
1539        "add r0, 24 \n\t"
1540        "umull r8, r9, r2, r5 \n\t"
1541        "stmia r0!, {r8, r9} \n\t"
1542        "sub r0, 32 \n\t"
1543        "sub r1, 28 \n\t"
1544
        /* r2..r7 = a[0]..a[5]; r1 is left pointing at a[6]. */
1545        "ldmia r1!, {r2, r3, r4, r5, r6, r7} \n\t"
1546
        /* Column 0: a[0]*a[0]. Low word is stored, r12 carries the high word. */
1547        "umull r11, r12, r2, r2 \n\t"
1548        "stmia r0!, {r11} \n\t"
1549
        /* Column 1: 2*a[0]*a[1], done by adding the same product twice.
           The first adcs also initialises r8 (r8 = high word + carry). */
1550        "mov r9, #0 \n\t"
1551        "umull r10, r11, r2, r3 \n\t"
1552        "adds r12, r10 \n\t"
1553        "adcs r8, r11, #0 \n\t"
1554        "adc r9, #0 \n\t"
1555        "adds r12, r10 \n\t"
1556        "adcs r8, r11 \n\t"
1557        "adc r9, #0 \n\t"
1558        "stmia r0!, {r12} \n\t"
1559
        /* Column 2: 2*a[0]*a[2] + a[1]*a[1]. */
1560        "mov r10, #0 \n\t"
1561        "umull r11, r12, r2, r4 \n\t"
1562        "adds r11, r11 \n\t"
1563        "adcs r12, r12 \n\t"
1564        "adc r10, #0 \n\t"
1565        "adds r8, r11 \n\t"
1566        "adcs r9, r12 \n\t"
1567        "adc r10, #0 \n\t"
1568        "umull r11, r12, r3, r3 \n\t"
1569        "adds r8, r11 \n\t"
1570        "adcs r9, r12 \n\t"
1571        "adc r10, #0 \n\t"
1572        "stmia r0!, {r8} \n\t"
1573
        /* Column 3: 2*(a[0]*a[3] + a[1]*a[2]), plus carry-in (r9, r10).
           First use of the umlal + cmp/adchi carry-detection sequence. */
1574        "mov r12, #0 \n\t"
1575        "umull r8, r11, r2, r5 \n\t"
1576        "mov r14, r11 \n\t"
1577        "umlal r8, r11, r3, r4 \n\t"
1578        "cmp r14, r11 \n\t"
1579        "it hi \n\t"
1580        "adchi r12, #0 \n\t"
1581        "adds r8, r8 \n\t"
1582        "adcs r11, r11 \n\t"
1583        "adc r12, r12 \n\t"
1584        "adds r8, r9 \n\t"
1585        "adcs r11, r10 \n\t"
1586        "adc r12, #0 \n\t"
1587        "stmia r0!, {r8} \n\t"
1588
        /* Column 4: 2*(a[0]*a[4] + a[1]*a[3]) + a[2]*a[2], plus carry-in (r11, r12). */
1589        "mov r10, #0 \n\t"
1590        "umull r8, r9, r2, r6 \n\t"
1591        "mov r14, r9 \n\t"
1592        "umlal r8, r9, r3, r5 \n\t"
1593        "cmp r14, r9 \n\t"
1594        "it hi \n\t"
1595        "adchi r10, #0 \n\t"
1596        "adds r8, r8 \n\t"
1597        "adcs r9, r9 \n\t"
1598        "adc r10, r10 \n\t"
1599        "mov r14, r9 \n\t"
1600        "umlal r8, r9, r4, r4 \n\t"
1601        "cmp r14, r9 \n\t"
1602        "it hi \n\t"
1603        "adchi r10, #0 \n\t"
1604        "adds r8, r11 \n\t"
1605        "adcs r9, r12 \n\t"
1606        "adc r10, #0 \n\t"
1607        "stmia r0!, {r8} \n\t"
1608
        /* Column 5: 2*(a[0]*a[5] + a[1]*a[4] + a[2]*a[3]), plus carry-in (r9, r10).
           This is the last use of a[0] in r2. */
1609        "mov r12, #0 \n\t"
1610        "umull r8, r11, r2, r7 \n\t"
1611        "mov r14, r11 \n\t"
1612        "umlal r8, r11, r3, r6 \n\t"
1613        "cmp r14, r11 \n\t"
1614        "it hi \n\t"
1615        "adchi r12, #0 \n\t"
1616        "mov r14, r11 \n\t"
1617        "umlal r8, r11, r4, r5 \n\t"
1618        "cmp r14, r11 \n\t"
1619        "it hi \n\t"
1620        "adchi r12, #0 \n\t"
1621        "adds r8, r8 \n\t"
1622        "adcs r11, r11 \n\t"
1623        "adc r12, r12 \n\t"
1624        "adds r8, r9 \n\t"
1625        "adcs r11, r10 \n\t"
1626        "adc r12, #0 \n\t"
1627        "stmia r0!, {r8} \n\t"
1628
        /* Column 6: r2 is reloaded with a[6].
           2*(a[1]*a[5] + a[2]*a[4] + parked result[6]) + a[3]*a[3], plus
           carry-in (r11, r12). result[6] holds the low word of a[0]*a[6] from
           the preamble and is added before the doubling. */
1629        "ldmia r1!, {r2} \n\t"
1630        "mov r10, #0 \n\t"
1631        "umull r8, r9, r3, r7 \n\t"
1632        "mov r14, r9 \n\t"
1633        "umlal r8, r9, r4, r6 \n\t"
1634        "cmp r14, r9 \n\t"
1635        "it hi \n\t"
1636        "adchi r10, #0 \n\t"
1637        "ldr r14, [r0] \n\t"
1638        "adds r8, r14 \n\t"
1639        "adcs r9, #0 \n\t"
1640        "adc r10, #0 \n\t"
1641        "adds r8, r8 \n\t"
1642        "adcs r9, r9 \n\t"
1643        "adc r10, r10 \n\t"
1644        "mov r14, r9 \n\t"
1645        "umlal r8, r9, r5, r5 \n\t"
1646        "cmp r14, r9 \n\t"
1647        "it hi \n\t"
1648        "adchi r10, #0 \n\t"
1649        "adds r8, r11 \n\t"
1650        "adcs r9, r12 \n\t"
1651        "adc r10, #0 \n\t"
1652        "stmia r0!, {r8} \n\t"
1653
        /* Column 7: 2*(a[1]*a[6] + a[2]*a[5] + a[3]*a[4] + parked result[7]),
           plus carry-in (r9, r10). result[7] holds the high word of a[0]*a[6]. */
1654        "mov r12, #0 \n\t"
1655        "umull r8, r11, r3, r2 \n\t"
1656        "mov r14, r11 \n\t"
1657        "umlal r8, r11, r4, r7 \n\t"
1658        "cmp r14, r11 \n\t"
1659        "it hi \n\t"
1660        "adchi r12, #0 \n\t"
1661        "mov r14, r11 \n\t"
1662        "umlal r8, r11, r5, r6 \n\t"
1663        "cmp r14, r11 \n\t"
1664        "it hi \n\t"
1665        "adchi r12, #0 \n\t"
1666        "ldr r14, [r0] \n\t"
1667        "adds r8, r14 \n\t"
1668        "adcs r11, #0 \n\t"
1669        "adc r12, #0 \n\t"
1670        "adds r8, r8 \n\t"
1671        "adcs r11, r11 \n\t"
1672        "adc r12, r12 \n\t"
1673        "adds r8, r9 \n\t"
1674        "adcs r11, r10 \n\t"
1675        "adc r12, #0 \n\t"
1676        "stmia r0!, {r8} \n\t"
1677
        /* Column 8: 2*(a[2]*a[6] + a[3]*a[5]) + a[4]*a[4], plus carry-in (r11, r12). */
1678        "mov r10, #0 \n\t"
1679        "umull r8, r9, r4, r2 \n\t"
1680        "mov r14, r9 \n\t"
1681        "umlal r8, r9, r5, r7 \n\t"
1682        "cmp r14, r9 \n\t"
1683        "it hi \n\t"
1684        "adchi r10, #0 \n\t"
1685        "adds r8, r8 \n\t"
1686        "adcs r9, r9 \n\t"
1687        "adc r10, r10 \n\t"
1688        "mov r14, r9 \n\t"
1689        "umlal r8, r9, r6, r6 \n\t"
1690        "cmp r14, r9 \n\t"
1691        "it hi \n\t"
1692        "adchi r10, #0 \n\t"
1693        "adds r8, r11 \n\t"
1694        "adcs r9, r12 \n\t"
1695        "adc r10, #0 \n\t"
1696        "stmia r0!, {r8} \n\t"
1697
        /* Column 9: 2*(a[3]*a[6] + a[4]*a[5]), plus carry-in (r9, r10). */
1698        "mov r12, #0 \n\t"
1699        "umull r8, r11, r5, r2 \n\t"
1700        "mov r14, r11 \n\t"
1701        "umlal r8, r11, r6, r7 \n\t"
1702        "cmp r14, r11 \n\t"
1703        "it hi \n\t"
1704        "adchi r12, #0 \n\t"
1705        "adds r8, r8 \n\t"
1706        "adcs r11, r11 \n\t"
1707        "adc r12, r12 \n\t"
1708        "adds r8, r9 \n\t"
1709        "adcs r11, r10 \n\t"
1710        "adc r12, #0 \n\t"
1711        "stmia r0!, {r8} \n\t"
1712
        /* Column 10: 2*a[4]*a[6] + a[5]*a[5], accumulated directly onto the
           carry-in held in r11, r12. r1 is no longer needed as a pointer and
           is used as scratch from here on. */
1713        "mov r8, #0 \n\t"
1714        "umull r1, r10, r6, r2 \n\t"
1715        "adds r1, r1 \n\t"
1716        "adcs r10, r10 \n\t"
1717        "adc r8, #0 \n\t"
1718        "adds r11, r1 \n\t"
1719        "adcs r12, r10 \n\t"
1720        "adc r8, #0 \n\t"
1721        "umull r1, r10, r7, r7 \n\t"
1722        "adds r11, r1 \n\t"
1723        "adcs r12, r10 \n\t"
1724        "adc r8, #0 \n\t"
1725        "stmia r0!, {r11} \n\t"
1726
        /* Column 11: 2*a[5]*a[6], added onto the carry-in held in r12, r8. */
1727        "mov r11, #0 \n\t"
1728        "umull r1, r10, r7, r2 \n\t"
1729        "adds r1, r1 \n\t"
1730        "adcs r10, r10 \n\t"
1731        "adc r11, #0 \n\t"
1732        "adds r12, r1 \n\t"
1733        "adcs r8, r10 \n\t"
1734        "adc r11, #0 \n\t"
1735        "stmia r0!, {r12} \n\t"
1736
        /* Columns 12 and 13: a[6]*a[6] plus carry-in; both remaining words are stored. */
1737        "umull r1, r10, r2, r2 \n\t"
1738        "adds r8, r1 \n\t"
1739        "adcs r11, r10 \n\t"
1740        "stmia r0!, {r8, r11} \n\t"
        /* On platforms other than Thumb-2, switch the assembler back to divided syntax. */
1741    #if (uECC_PLATFORM != uECC_arm_thumb2)
1742        ".syntax divided \n\t"
1743    #endif
        /* r0 and r1 are read and modified by the asm; no input-only operands. */
1744        : "+r" (r0), "+r" (r1)
1745        :
1746        : "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
1747    );
1748}
1749#define asm_square 1
1750#endif /* (uECC_WORDS == 7) */
1751
1752#if (uECC_WORDS == 8)
/* vli_square, 8-word variant (compiled when uECC_WORDS == 8).
 *
 * Computes result = left * left, where `left` is read as eight 32-bit words
 * and `result` is written as sixteen 32-bit words, least-significant word
 * first.
 *
 * Notation used below: a[i] is the word left[i].
 *
 * Register use: r2..r7 first hold a[0]..a[5]; later r2 is reloaded with a[6]
 * and r3 with a[7]. The cross products that pair a low word with a word that
 * only arrives after the reload (a[0]*a[6], a[0]*a[7], a[1]*a[7]) are
 * computed up front and their sum is parked in result[6..9]; columns 6-9 read
 * those words back with `ldr r14, [r0]` before overwriting them.
 *
 * The output is produced one word ("column") at a time. Column k is the sum
 * of every product a[i]*a[j] with i + j == k; products with i != j are formed
 * once and doubled. Each column keeps a three-register accumulator (low word,
 * high word, overflow). The low word is stored through r0 and the upper two
 * registers become the starting value of the next column.
 *
 * From column 3 onwards products are accumulated with umlal. umlal does not
 * report a carry out of its 64-bit accumulator, so the high word is copied to
 * r14 beforehand and compared afterwards: if the old high word is greater
 * than the new one (HI), the accumulator wrapped and 1 is added to the
 * overflow register (C is set whenever HI holds, so `adchi rX, #0` adds 1).
 */
1753static void vli_square(uint32_t *result, const uint32_t *left) {
1754    register uint32_t *r0 __asm__("r0") = result;
1755    register const uint32_t *r1 __asm__("r1") = left;
1756
1757    __asm__ volatile (
1758        ".syntax unified \n\t"
        /* Preamble: r2, r3 = a[0], a[1]; r5, r6 = a[6], a[7]; r0 -> result[6]. */
1759        "ldmia r1!, {r2, r3} \n\t"
1760        "add r1, 16 \n\t"
1761        "ldmia r1!, {r5, r6} \n\t"
1762        "add r0, 24 \n\t"
1763
        /* result[6] = low word of a[0]*a[6]; its high word stays in r9. */
1764        "umull r8, r9, r2, r5 \n\t"
1765        "stmia r0!, {r8} \n\t"
1766
        /* result[7] = high(a[0]*a[6]) + low(a[0]*a[7]); carry goes into r10. */
1767        "umull r12, r10, r2, r6 \n\t"
1768        "adds r9, r12 \n\t"
1769        "adc r10, #0 \n\t"
1770        "stmia r0!, {r9} \n\t"
1771
        /* result[8..9] = a[1]*a[7] plus the carried high word in r10. */
1772        "umull r8, r9, r3, r6 \n\t"
1773        "adds r10, r8 \n\t"
1774        "adc r11, r9, #0 \n\t"
1775        "stmia r0!, {r10, r11} \n\t"
1776
        /* Rewind both pointers to the start, then r2..r7 = a[0]..a[5];
           r1 is left pointing at a[6]. */
1777        "sub r0, 40 \n\t"
1778        "sub r1, 32 \n\t"
1779        "ldmia r1!, {r2,r3,r4,r5,r6,r7} \n\t"
1780
        /* Column 0: a[0]*a[0]. Low word is stored, r12 carries the high word. */
1781        "umull r11, r12, r2, r2 \n\t"
1782        "stmia r0!, {r11} \n\t"
1783
        /* Column 1: 2*a[0]*a[1], done by adding the same product twice.
           The first adcs also initialises r8 (r8 = high word + carry). */
1784        "mov r9, #0 \n\t"
1785        "umull r10, r11, r2, r3 \n\t"
1786        "adds r12, r10 \n\t"
1787        "adcs r8, r11, #0 \n\t"
1788        "adc r9, #0 \n\t"
1789        "adds r12, r10 \n\t"
1790        "adcs r8, r11 \n\t"
1791        "adc r9, #0 \n\t"
1792        "stmia r0!, {r12} \n\t"
1793
        /* Column 2: 2*a[0]*a[2] + a[1]*a[1]. */
1794        "mov r10, #0 \n\t"
1795        "umull r11, r12, r2, r4 \n\t"
1796        "adds r11, r11 \n\t"
1797        "adcs r12, r12 \n\t"
1798        "adc r10, #0 \n\t"
1799        "adds r8, r11 \n\t"
1800        "adcs r9, r12 \n\t"
1801        "adc r10, #0 \n\t"
1802        "umull r11, r12, r3, r3 \n\t"
1803        "adds r8, r11 \n\t"
1804        "adcs r9, r12 \n\t"
1805        "adc r10, #0 \n\t"
1806        "stmia r0!, {r8} \n\t"
1807
        /* Column 3: 2*(a[0]*a[3] + a[1]*a[2]), plus carry-in (r9, r10).
           First use of the umlal + cmp/adchi carry-detection sequence. */
1808        "mov r12, #0 \n\t"
1809        "umull r8, r11, r2, r5 \n\t"
1810        "mov r14, r11 \n\t"
1811        "umlal r8, r11, r3, r4 \n\t"
1812        "cmp r14, r11 \n\t"
1813        "it hi \n\t"
1814        "adchi r12, #0 \n\t"
1815        "adds r8, r8 \n\t"
1816        "adcs r11, r11 \n\t"
1817        "adc r12, r12 \n\t"
1818        "adds r8, r9 \n\t"
1819        "adcs r11, r10 \n\t"
1820        "adc r12, #0 \n\t"
1821        "stmia r0!, {r8} \n\t"
1822
        /* Column 4: 2*(a[0]*a[4] + a[1]*a[3]) + a[2]*a[2], plus carry-in (r11, r12). */
1823        "mov r10, #0 \n\t"
1824        "umull r8, r9, r2, r6 \n\t"
1825        "mov r14, r9 \n\t"
1826        "umlal r8, r9, r3, r5 \n\t"
1827        "cmp r14, r9 \n\t"
1828        "it hi \n\t"
1829        "adchi r10, #0 \n\t"
1830        "adds r8, r8 \n\t"
1831        "adcs r9, r9 \n\t"
1832        "adc r10, r10 \n\t"
1833        "mov r14, r9 \n\t"
1834        "umlal r8, r9, r4, r4 \n\t"
1835        "cmp r14, r9 \n\t"
1836        "it hi \n\t"
1837        "adchi r10, #0 \n\t"
1838        "adds r8, r11 \n\t"
1839        "adcs r9, r12 \n\t"
1840        "adc r10, #0 \n\t"
1841        "stmia r0!, {r8} \n\t"
1842
        /* Column 5: 2*(a[0]*a[5] + a[1]*a[4] + a[2]*a[3]), plus carry-in (r9, r10).
           This is the last use of a[0] in r2. */
1843        "mov r12, #0 \n\t"
1844        "umull r8, r11, r2, r7 \n\t"
1845        "mov r14, r11 \n\t"
1846        "umlal r8, r11, r3, r6 \n\t"
1847        "cmp r14, r11 \n\t"
1848        "it hi \n\t"
1849        "adchi r12, #0 \n\t"
1850        "mov r14, r11 \n\t"
1851        "umlal r8, r11, r4, r5 \n\t"
1852        "cmp r14, r11 \n\t"
1853        "it hi \n\t"
1854        "adchi r12, #0 \n\t"
1855        "adds r8, r8 \n\t"
1856        "adcs r11, r11 \n\t"
1857        "adc r12, r12 \n\t"
1858        "adds r8, r9 \n\t"
1859        "adcs r11, r10 \n\t"
1860        "adc r12, #0 \n\t"
1861        "stmia r0!, {r8} \n\t"
1862
        /* Column 6: r2 is reloaded with a[6].
           2*(a[1]*a[5] + a[2]*a[4] + parked result[6]) + a[3]*a[3], plus
           carry-in (r11, r12). The parked word is added before the doubling. */
1863        "ldmia r1!, {r2} \n\t"
1864        "mov r10, #0 \n\t"
1865        "umull r8, r9, r3, r7 \n\t"
1866        "mov r14, r9 \n\t"
1867        "umlal r8, r9, r4, r6 \n\t"
1868        "cmp r14, r9 \n\t"
1869        "it hi \n\t"
1870        "adchi r10, #0 \n\t"
1871        "ldr r14, [r0] \n\t"
1872        "adds r8, r14 \n\t"
1873        "adcs r9, #0 \n\t"
1874        "adc r10, #0 \n\t"
1875        "adds r8, r8 \n\t"
1876        "adcs r9, r9 \n\t"
1877        "adc r10, r10 \n\t"
1878        "mov r14, r9 \n\t"
1879        "umlal r8, r9, r5, r5 \n\t"
1880        "cmp r14, r9 \n\t"
1881        "it hi \n\t"
1882        "adchi r10, #0 \n\t"
1883        "adds r8, r11 \n\t"
1884        "adcs r9, r12 \n\t"
1885        "adc r10, #0 \n\t"
1886        "stmia r0!, {r8} \n\t"
1887
        /* Column 7: 2*(a[1]*a[6] + a[2]*a[5] + a[3]*a[4] + parked result[7]),
           plus carry-in (r9, r10). This is the last use of a[1] in r3. */
1888        "mov r12, #0 \n\t"
1889        "umull r8, r11, r3, r2 \n\t"
1890        "mov r14, r11 \n\t"
1891        "umlal r8, r11, r4, r7 \n\t"
1892        "cmp r14, r11 \n\t"
1893        "it hi \n\t"
1894        "adchi r12, #0 \n\t"
1895        "mov r14, r11 \n\t"
1896        "umlal r8, r11, r5, r6 \n\t"
1897        "cmp r14, r11 \n\t"
1898        "it hi \n\t"
1899        "adchi r12, #0 \n\t"
1900        "ldr r14, [r0] \n\t"
1901        "adds r8, r14 \n\t"
1902        "adcs r11, #0 \n\t"
1903        "adc r12, #0 \n\t"
1904        "adds r8, r8 \n\t"
1905        "adcs r11, r11 \n\t"
1906        "adc r12, r12 \n\t"
1907        "adds r8, r9 \n\t"
1908        "adcs r11, r10 \n\t"
1909        "adc r12, #0 \n\t"
1910        "stmia r0!, {r8} \n\t"
1911
        /* Column 8: r3 is reloaded with a[7].
           2*(a[2]*a[6] + a[3]*a[5] + parked result[8]) + a[4]*a[4], plus
           carry-in (r11, r12). */
1912        "ldmia r1!, {r3} \n\t"
1913        "mov r10, #0 \n\t"
1914        "umull r8, r9, r4, r2 \n\t"
1915        "mov r14, r9 \n\t"
1916        "umlal r8, r9, r5, r7 \n\t"
1917        "cmp r14, r9 \n\t"
1918        "it hi \n\t"
1919        "adchi r10, #0 \n\t"
1920        "ldr r14, [r0] \n\t"
1921        "adds r8, r14 \n\t"
1922        "adcs r9, #0 \n\t"
1923        "adc r10, #0 \n\t"
1924        "adds r8, r8 \n\t"
1925        "adcs r9, r9 \n\t"
1926        "adc r10, r10 \n\t"
1927        "mov r14, r9 \n\t"
1928        "umlal r8, r9, r6, r6 \n\t"
1929        "cmp r14, r9 \n\t"
1930        "it hi \n\t"
1931        "adchi r10, #0 \n\t"
1932        "adds r8, r11 \n\t"
1933        "adcs r9, r12 \n\t"
1934        "adc r10, #0 \n\t"
1935        "stmia r0!, {r8} \n\t"
1936
        /* Column 9: 2*(a[2]*a[7] + a[3]*a[6] + a[4]*a[5] + parked result[9]),
           plus carry-in (r9, r10). */
1937        "mov r12, #0 \n\t"
1938        "umull r8, r11, r4, r3 \n\t"
1939        "mov r14, r11 \n\t"
1940        "umlal r8, r11, r5, r2 \n\t"
1941        "cmp r14, r11 \n\t"
1942        "it hi \n\t"
1943        "adchi r12, #0 \n\t"
1944        "mov r14, r11 \n\t"
1945        "umlal r8, r11, r6, r7 \n\t"
1946        "cmp r14, r11 \n\t"
1947        "it hi \n\t"
1948        "adchi r12, #0 \n\t"
1949        "ldr r14, [r0] \n\t"
1950        "adds r8, r14 \n\t"
1951        "adcs r11, #0 \n\t"
1952        "adc r12, #0 \n\t"
1953        "adds r8, r8 \n\t"
1954        "adcs r11, r11 \n\t"
1955        "adc r12, r12 \n\t"
1956        "adds r8, r9 \n\t"
1957        "adcs r11, r10 \n\t"
1958        "adc r12, #0 \n\t"
1959        "stmia r0!, {r8} \n\t"
1960
        /* Column 10: 2*(a[3]*a[7] + a[4]*a[6]) + a[5]*a[5], plus carry-in (r11, r12). */
1961        "mov r10, #0 \n\t"
1962        "umull r8, r9, r5, r3 \n\t"
1963        "mov r14, r9 \n\t"
1964        "umlal r8, r9, r6, r2 \n\t"
1965        "cmp r14, r9 \n\t"
1966        "it hi \n\t"
1967        "adchi r10, #0 \n\t"
1968        "adds r8, r8 \n\t"
1969        "adcs r9, r9 \n\t"
1970        "adc r10, r10 \n\t"
1971        "mov r14, r9 \n\t"
1972        "umlal r8, r9, r7, r7 \n\t"
1973        "cmp r14, r9 \n\t"
1974        "it hi \n\t"
1975        "adchi r10, #0 \n\t"
1976        "adds r8, r11 \n\t"
1977        "adcs r9, r12 \n\t"
1978        "adc r10, #0 \n\t"
1979        "stmia r0!, {r8} \n\t"
1980
        /* Column 11: 2*(a[4]*a[7] + a[5]*a[6]), plus carry-in (r9, r10). */
1981        "mov r12, #0 \n\t"
1982        "umull r8, r11, r6, r3 \n\t"
1983        "mov r14, r11 \n\t"
1984        "umlal r8, r11, r7, r2 \n\t"
1985        "cmp r14, r11 \n\t"
1986        "it hi \n\t"
1987        "adchi r12, #0 \n\t"
1988        "adds r8, r8 \n\t"
1989        "adcs r11, r11 \n\t"
1990        "adc r12, r12 \n\t"
1991        "adds r8, r9 \n\t"
1992        "adcs r11, r10 \n\t"
1993        "adc r12, #0 \n\t"
1994        "stmia r0!, {r8} \n\t"
1995
        /* Column 12: 2*a[5]*a[7] + a[6]*a[6], accumulated directly onto the
           carry-in held in r11, r12. r1 is no longer needed as a pointer and
           is used as scratch from here on. */
1996        "mov r8, #0 \n\t"
1997        "umull r1, r10, r7, r3 \n\t"
1998        "adds r1, r1 \n\t"
1999        "adcs r10, r10 \n\t"
2000        "adc r8, #0 \n\t"
2001        "adds r11, r1 \n\t"
2002        "adcs r12, r10 \n\t"
2003        "adc r8, #0 \n\t"
2004        "umull r1, r10, r2, r2 \n\t"
2005        "adds r11, r1 \n\t"
2006        "adcs r12, r10 \n\t"
2007        "adc r8, #0 \n\t"
2008        "stmia r0!, {r11} \n\t"
2009
        /* Column 13: 2*a[6]*a[7], added onto the carry-in held in r12, r8. */
2010        "mov r11, #0 \n\t"
2011        "umull r1, r10, r2, r3 \n\t"
2012        "adds r1, r1 \n\t"
2013        "adcs r10, r10 \n\t"
2014        "adc r11, #0 \n\t"
2015        "adds r12, r1 \n\t"
2016        "adcs r8, r10 \n\t"
2017        "adc r11, #0 \n\t"
2018        "stmia r0!, {r12} \n\t"
2019
        /* Columns 14 and 15: a[7]*a[7] plus carry-in; both remaining words are stored. */
2020        "umull r1, r10, r3, r3 \n\t"
2021        "adds r8, r1 \n\t"
2022        "adcs r11, r10 \n\t"
2023        "stmia r0!, {r8, r11} \n\t"
        /* On platforms other than Thumb-2, switch the assembler back to divided syntax. */
2024    #if (uECC_PLATFORM != uECC_arm_thumb2)
2025        ".syntax divided \n\t"
2026    #endif
        /* r0 and r1 are read and modified by the asm; no input-only operands. */
2027        : "+r" (r0), "+r" (r1)
2028        :
2029        : "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
2030    );
2031}
2032#define asm_square 1
2033#endif /* (uECC_WORDS == 8) */
2034#endif /* uECC_SQUARE_FUNC */
2035
2036#endif /* (uECC_PLATFORM != uECC_arm_thumb) */
2037#endif /* (uECC_ASM == uECC_asm_fast) */
2038
2039#if !asm_add
2040static uint32_t vli_add(uint32_t *result, const uint32_t *left, const uint32_t *right) {
2041    uint32_t counter = uECC_WORDS;
2042    uint32_t carry = 0;
2043    uint32_t left_word;
2044    uint32_t right_word;
2045
2046    __asm__ volatile (
2047        ".syntax unified \n\t"
2048        "1: \n\t"
2049        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word. */
2050        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
2051        "lsrs %[carry], #1 \n\t"          /* Set up carry flag (carry = 0 after this). */
2052        "adcs %[left], %[right] \n\t"     /* Add with carry. */
2053        "adcs %[carry], %[carry] \n\t"    /* Store carry bit. */
2054        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word. */
2055        "subs %[ctr], #1 \n\t"            /* Decrement counter. */
2056        "bne 1b \n\t"                     /* Loop until counter == 0. */
2057    #if (uECC_PLATFORM != uECC_arm_thumb2)
2058        ".syntax divided \n\t"
2059    #endif
2060    #if (uECC_PLATFORM == uECC_arm_thumb)
2061        : [dptr] "+l" (result), [lptr] "+l" (left), [rptr] "+l" (right),
2062          [ctr] "+l" (counter), [carry] "+l" (carry),
2063          [left] "=l" (left_word), [right] "=l" (right_word)
2064    #else
2065        : [dptr] "+r" (result), [lptr] "+r" (left), [rptr] "+r" (right),
2066          [ctr] "+r" (counter), [carry] "+r" (carry),
2067          [left] "=r" (left_word), [right] "=r" (right_word)
2068    #endif
2069        :
2070        : "cc", "memory"
2071    );
2072    return carry;
2073}
2074#define asm_add 1
2075#endif
2076
2077#if !asm_sub
2078static uint32_t vli_sub(uint32_t *result, const uint32_t *left, const uint32_t *right) {
2079    uint32_t counter = uECC_WORDS;
2080    uint32_t carry = 1; /* carry = 1 initially (means don't borrow) */
2081    uint32_t left_word;
2082    uint32_t right_word;
2083
2084    __asm__ volatile (
2085        ".syntax unified \n\t"
2086        "1: \n\t"
2087        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word. */
2088        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
2089        "lsrs %[carry], #1 \n\t"          /* Set up carry flag (carry = 0 after this). */
2090        "sbcs %[left], %[right] \n\t"     /* Subtract with borrow. */
2091        "adcs %[carry], %[carry] \n\t"    /* Store carry bit. */
2092        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word. */
2093        "subs %[ctr], #1 \n\t"            /* Decrement counter. */
2094        "bne 1b \n\t"                     /* Loop until counter == 0. */
2095    #if (uECC_PLATFORM != uECC_arm_thumb2)
2096        ".syntax divided \n\t"
2097    #endif
2098    #if (uECC_PLATFORM == uECC_arm_thumb)
2099        : [dptr] "+l" (result), [lptr] "+l" (left), [rptr] "+l" (right),
2100          [ctr] "+l" (counter), [carry] "+l" (carry),
2101          [left] "=l" (left_word), [right] "=l" (right_word)
2102    #else
2103        : [dptr] "+r" (result), [lptr] "+r" (left), [rptr] "+r" (right),
2104          [ctr] "+r" (counter), [carry] "+r" (carry),
2105          [left] "=r" (left_word), [right] "=r" (right_word)
2106    #endif
2107        :
2108        : "cc", "memory"
2109    );
2110    return !carry;
2111}
2112#define asm_sub 1
2113#endif
2114
2115#if !asm_mult
/* Multiplies two uECC_WORDS-word integers: result = left * right.
 *
 * left and right are each read as uECC_WORDS 32-bit words; result receives
 * 2 * uECC_WORDS words (the final store is to word uECC_WORDS * 2 - 1).
 *
 * The product is built one output word ("column") at a time. For a given k,
 * every partial product left[i] * right[k - i] is summed into the three-word
 * accumulator c2:c1:c0; c0 is then stored to result[k] and the accumulator is
 * moved down by one word (c0 = c1, c1 = c2, c2 = 0). k and i are kept as byte
 * offsets (word index times 4) so they can be used directly as load/store
 * offsets.
 *
 * Two variants are selected at compile time: a UMULL-based loop for every
 * platform except uECC_arm_thumb, and a Thumb-1 loop that assembles each
 * 32x32-bit product from four 16x16-bit multiplies.
 */
2116static void vli_mult(uint32_t *result, const uint32_t *left, const uint32_t *right) {
2117#if (uECC_PLATFORM != uECC_arm_thumb)
2118    uint32_t c0 = 0; /* accumulator, low word: becomes result[k] */
2119    uint32_t c1 = 0; /* accumulator, middle word */
2120    uint32_t c2 = 0; /* accumulator, top word: collects carries out of c1 */
2121    uint32_t k = 0; /* current output column, as a byte offset */
2122    uint32_t i; /* current index into left, as a byte offset */
2123    uint32_t t0, t1; /* scratch: offset and operands, then low/high product words */
2124
2125    __asm__ volatile (
2126        ".syntax unified \n\t"
2127
2128        "1: \n\t" /* outer loop (k < uECC_WORDS) */
2129        "movs %[i], #0 \n\t" /* i = 0 */
2130        "b 3f \n\t"
2131
2132        "2: \n\t" /* outer loop (k >= uECC_WORDS) */
2133        "movs %[i], %[k] \n\t"      /* i = k */
2134        "subs %[i], %[eccdm1] \n\t" /* i = k - (uECC_WORDS - 1) (times 4), so k - i starts at the last word of right */
2135
2136        "3: \n\t" /* inner loop */
2137        "subs %[t0], %[k], %[i] \n\t" /* t0 = k-i */
2138
2139        "ldr %[t1], [%[right], %[t0]] \n\t" /* t1 = right[k - i] */
2140        "ldr %[t0], [%[left], %[i]] \n\t"   /* t0 = left[i] */
2141
2142        "umull %[t0], %[t1], %[t0], %[t1] \n\t" /* (t0, t1) = left[i] * right[k - i] (t0 = low word, t1 = high word) */
2143
2144        "adds %[c0], %[t0] \n\t" /* add low word to c0 */
2145        "adcs %[c1], %[t1] \n\t" /* add high word to c1, including carry */
2146        "adcs %[c2], #0 \n\t"    /* add carry to c2 */
2147
2148        "adds %[i], #4 \n\t"     /* i += 4 */
2149        "cmp %[i], %[eccd] \n\t" /* i < uECC_WORDS (times 4)? */
2150        "bge 4f \n\t"            /*   if not, exit the loop */
2151        "cmp %[i], %[k] \n\t"    /* i <= k? */
2152        "ble 3b \n\t"            /*   if so, continue looping */
2153
2154        "4: \n\t" /* end inner loop */
2155
2156        "str %[c0], [%[result], %[k]] \n\t" /* result[k] = c0 */
2157        "mov %[c0], %[c1] \n\t"     /* c0 = c1 */
2158        "mov %[c1], %[c2] \n\t"     /* c1 = c2 */
2159        "movs %[c2], #0 \n\t"       /* c2 = 0 */
2160        "adds %[k], #4 \n\t"        /* k += 4 */
2161        "cmp %[k], %[eccd] \n\t"    /* k < uECC_WORDS (times 4) ? */
2162        "blt 1b \n\t"               /*   if so, loop back, start with i = 0 */
2163        "cmp %[k], %[eccd2m1] \n\t" /* k < uECC_WORDS * 2 - 1 (times 4) ? */
2164        "blt 2b \n\t"               /*   if so, loop back, start with i = (k + 1) - uECC_WORDS */
2165        /* end outer loop */
2166
2167        "str %[c0], [%[result], %[k]] \n\t" /* result[uECC_WORDS * 2 - 1] = c0 */
2168    #if (uECC_PLATFORM != uECC_arm_thumb2)
2169        ".syntax divided \n\t"
2170    #endif
        /* Outputs: the accumulator and k are read-write; i, t0 and t1 are
           early-clobber scratch because they are written while the input
           pointers below are still needed. */
2171        : [c0] "+r" (c0), [c1] "+r" (c1), [c2] "+r" (c2),
2172          [k] "+r" (k), [i] "=&r" (i), [t0] "=&r" (t0), [t1] "=&r" (t1)
        /* Inputs: the three pointers, plus the loop bounds as byte-offset
           immediates substituted into the cmp/subs instructions above. */
2173        : [result] "r" (result), [left] "r" (left), [right] "r" (right),
2174          [eccd] "I" (uECC_WORDS * 4), [eccdm1] "I" ((uECC_WORDS-1) * 4),
2175          [eccd2m1] "I" ((uECC_WORDS * 2 - 1) * 4)
2176        : "cc", "memory"
2177    );
2178
2179#else /* Thumb-1 */
2180
    /* The assembly below names r0-r7 explicitly, so the arguments are pinned
       to r0-r2. r3-r5 hold c0-c2, r6 holds k and r7 holds i. */
2181    register uint32_t *r0 __asm__("r0") = result;
2182    register const uint32_t *r1 __asm__("r1") = left;
2183    register const uint32_t *r2 __asm__("r2") = right;
2184
2185    __asm__ volatile (
2186        ".syntax unified \n\t"
2187        "movs r3, #0 \n\t" /* c0 = 0 */
2188        "movs r4, #0 \n\t" /* c1 = 0 */
2189        "movs r5, #0 \n\t" /* c2 = 0 */
2190        "movs r6, #0 \n\t" /* k = 0 */
2191
2192        "push {r0} \n\t" /* keep result on the stack; r0 is reused as scratch below */
2193
2194        "1: \n\t" /* outer loop (k < uECC_WORDS) */
2195        "movs r7, #0 \n\t" /* r7 = i = 0 */
2196        "b 3f \n\t"
2197
2198        "2: \n\t" /* outer loop (k >= uECC_WORDS) */
2199        "movs r7, r6 \n\t"        /* r7 = k */
2200        "subs r7, %[eccdm1] \n\t" /* r7 = i = k - (uECC_WORDS - 1) (times 4) */
2201
2202        "3: \n\t" /* inner loop */
2203        "push {r3, r4, r5, r6} \n\t" /* save c0, c1, c2 and k to free r3-r6 for the multiply; r3 (c0) is at the top of stack. */
2204        "subs r0, r6, r7 \n\t"       /* r0 = k - i */
2205
2206        "ldr r4, [r2, r0] \n\t" /* r4 = right[k - i] */
2207        "ldr r0, [r1, r7] \n\t" /* r0 = left[i] */
2208
        /* Split both operands into 16-bit halves: left[i] = a1:a0,
           right[k - i] = b1:b0. */
2209        "lsrs r3, r0, #16 \n\t" /* r3 = a1 */
2210        "uxth r0, r0 \n\t"      /* r0 = a0 */
2211
2212        "lsrs r5, r4, #16 \n\t" /* r5 = b1 */
2213        "uxth r4, r4 \n\t"      /* r4 = b0 */
2214
2215        "movs r6, r3 \n\t"     /* r6 = a1 */
2216        "muls r6, r5, r6 \n\t" /* r6 = a1 * b1 */
2217        "muls r3, r4, r3 \n\t" /* r3 = b0 * a1 */
2218        "muls r5, r0, r5 \n\t" /* r5 = a0 * b1 */
2219        "muls r0, r4, r0 \n\t" /* r0 = a0 * b0 */
2220
2221        "movs r4, #0 \n\t"  /* r4 = 0 */
2222        "adds r3, r5 \n\t"  /* r3 = b0 * a1 + a0 * b1 */
2223        "adcs r4, r4 \n\t"  /* r4 = carry */
2224        "lsls r4, #16 \n\t" /* r4 = carry << 16 */
2225        "adds r6, r4 \n\t"  /* r6 = a1 * b1 + (carry << 16) */
2226
2227        "lsls r4, r3, #16 \n\t" /* r4 = (b0 * a1 + a0 * b1) << 16 */
2228        "lsrs r3, #16 \n\t"     /* r3 = (b0 * a1 + a0 * b1) >> 16 */
2229        "adds r0, r4 \n\t"      /* r0 = low word = a0 * b0 + ((b0 * a1 + a0 * b1) << 16) */
2230        "adcs r6, r3 \n\t"      /* r6 = high word = a1 * b1 + (carry << 16) + ((b0 * a1 + a0 * b1) >> 16) + carry from low word */
2231
2232        "pop {r3, r4, r5} \n\t" /* r3 = c0, r4 = c1, r5 = c2 */
2233        "adds r3, r0 \n\t"      /* add low word to c0 */
2234        "adcs r4, r6 \n\t"      /* add high word to c1, including carry */
2235        "movs r0, #0 \n\t"      /* r0 = 0 (does not affect carry bit) */
2236        "adcs r5, r0 \n\t"      /* add carry to c2 */
2237
2238        "pop {r6} \n\t" /* r6 = k */
2239
2240        "adds r7, #4 \n\t"     /* i += 4 */
2241        "cmp r7, %[eccd] \n\t" /* i < uECC_WORDS (times 4)? */
2242        "bge 4f \n\t"          /*   if not, exit the loop */
2243        "cmp r7, r6 \n\t"      /* i <= k? */
2244        "ble 3b \n\t"          /*   if so, continue looping */
2245
2246        "4: \n\t" /* end inner loop */
2247
2248        "ldr r0, [sp, #0] \n\t" /* r0 = result (reload the copy pushed before the loop) */
2249
2250        "str r3, [r0, r6] \n\t"   /* result[k] = c0 */
2251        "mov r3, r4 \n\t"         /* c0 = c1 */
2252        "mov r4, r5 \n\t"         /* c1 = c2 */
2253        "movs r5, #0 \n\t"        /* c2 = 0 */
2254        "adds r6, #4 \n\t"        /* k += 4 */
2255        "cmp r6, %[eccd] \n\t"    /* k < uECC_WORDS (times 4) ? */
2256        "blt 1b \n\t"             /*   if so, loop back, start with i = 0 */
2257        "cmp r6, %[eccd2m1] \n\t" /* k < uECC_WORDS * 2 - 1 (times 4) ? */
2258        "blt 2b \n\t"             /*   if so, loop back, start with i = (k + 1) - uECC_WORDS */
2259        /* end outer loop */
2260
2261        "str r3, [r0, r6] \n\t" /* result[uECC_WORDS * 2 - 1] = c0 */
2262        "pop {r0} \n\t"         /* pop result off the stack (balances the push before the loop) */
2263
2264        ".syntax divided \n\t"
2265        :
        /* r0-r2 are listed as inputs only; r0 is overwritten inside the loop
           but holds the result pointer again after the final pop. */
2266        : [r0] "l" (r0), [r1] "l" (r1), [r2] "l" (r2), [eccd] "I" (uECC_WORDS * 4), [eccdm1] "I" ((uECC_WORDS-1) * 4), [eccd2m1] "I" ((uECC_WORDS * 2 - 1) * 4)
2267        : "r3", "r4", "r5", "r6", "r7", "cc", "memory"
2268    );
2269#endif
2270}
2271#define asm_mult 1
2272#endif /* !asm_mult */
2273
2274#if uECC_SQUARE_FUNC
2275#if !asm_square
/* Squares a uECC_WORDS-word integer: result = left * left.
 *
 * left is read as uECC_WORDS 32-bit words; result receives 2 * uECC_WORDS
 * words (the final store is to word uECC_WORDS * 2 - 1).
 *
 * The square is built one output word ("column") at a time in the three-word
 * accumulator c2:c1:c0. For a given k only the pairs with i <= k - i are
 * visited: a product with i < k - i is doubled before being added (it stands
 * for both left[i] * left[k - i] and left[k - i] * left[i]), while the
 * diagonal product with i == k - i is added once. c0 is then stored to
 * result[k] and the accumulator is moved down by one word. k, i and k - i are
 * kept as byte offsets (word index times 4).
 *
 * Two variants are selected at compile time: a UMULL-based loop for every
 * platform except uECC_arm_thumb, and a Thumb-1 loop that assembles each
 * 32x32-bit product from four 16x16-bit multiplies.
 */
2276static void vli_square(uint32_t *result, const uint32_t *left) {
2277#if (uECC_PLATFORM != uECC_arm_thumb)
2278    uint32_t c0 = 0; /* accumulator, low word: becomes result[k] */
2279    uint32_t c1 = 0; /* accumulator, middle word */
2280    uint32_t c2 = 0; /* accumulator, top word: collects carries */
2281    uint32_t k = 0; /* current output column, as a byte offset */
2282    uint32_t i, tt; /* i = lower index of the pair, tt = k - i (byte offsets) */
2283    uint32_t t0, t1; /* operands, then low/high words of the product */
2284
2285    __asm__ volatile (
2286        ".syntax unified \n\t"
2287
2288        "1: \n\t" /* outer loop (k < uECC_WORDS) */
2289        "movs %[i], #0 \n\t" /* i = 0 */
2290        "b 3f \n\t"
2291
2292        "2: \n\t" /* outer loop (k >= uECC_WORDS) */
2293        "movs %[i], %[k] \n\t"      /* i = k */
2294        "subs %[i], %[eccdm1] \n\t" /* i = k - (uECC_WORDS - 1) (times 4) */
2295
2296        "3: \n\t" /* inner loop */
2297        "subs %[tt], %[k], %[i] \n\t" /* tt = k-i */
2298
2299        "ldr %[t1], [%[left], %[tt]] \n\t" /* t1 = left[k - i] */
2300        "ldr %[t0], [%[left], %[i]] \n\t"  /* t0 = left[i] */
2301
2302        "umull %[t0], %[t1], %[t0], %[t1] \n\t" /* (t0, t1) = left[i] * left[k - i] (t0 = low word, t1 = high word) */
2303
2304        "cmp %[i], %[tt] \n\t" /* (i < k - i) ? */
2305        "bge 4f \n\t"          /*   if i >= k - i, skip the doubling (diagonal term) */
2306        "lsls %[t1], #1 \n\t"  /* high word << 1 */
2307        "adc %[c2], #0 \n\t"   /* add carry bit (bit shifted out of the high word) to c2 */
2308        "lsls %[t0], #1 \n\t"  /* low word << 1 */
2309        "adc %[t1], #0 \n\t"   /* add carry bit (bit shifted out of the low word) to high word */
2310
2311        "4: \n\t"
2312
2313        "adds %[c0], %[t0] \n\t" /* add low word to c0 */
2314        "adcs %[c1], %[t1] \n\t" /* add high word to c1, including carry */
2315        "adc %[c2], #0 \n\t"     /* add carry to c2 */
2316
2317        "adds %[i], #4 \n\t"          /* i += 4 */
2318        "cmp %[i], %[k] \n\t"         /* i < k? */
2319        "bge 5f \n\t"                 /*   if not, exit the loop */
2320        "subs %[tt], %[k], %[i] \n\t" /* tt = k - i */
2321        "cmp %[i], %[tt] \n\t"        /* i <= k - i? */
2322        "ble 3b \n\t"                 /*   if so, continue looping */
2323
2324        "5: \n\t" /* end inner loop */
2325
2326        "str %[c0], [%[result], %[k]] \n\t" /* result[k] = c0 */
2327        "mov %[c0], %[c1] \n\t"     /* c0 = c1 */
2328        "mov %[c1], %[c2] \n\t"     /* c1 = c2 */
2329        "movs %[c2], #0 \n\t"       /* c2 = 0 */
2330        "adds %[k], #4 \n\t"        /* k += 4 */
2331        "cmp %[k], %[eccd] \n\t"    /* k < uECC_WORDS (times 4) ? */
2332        "blt 1b \n\t"               /*   if so, loop back, start with i = 0 */
2333        "cmp %[k], %[eccd2m1] \n\t" /* k < uECC_WORDS * 2 - 1 (times 4) ? */
2334        "blt 2b \n\t"               /*   if so, loop back, start with i = (k + 1) - uECC_WORDS */
2335        /* end outer loop */
2336
2337        "str %[c0], [%[result], %[k]] \n\t" /* result[uECC_WORDS * 2 - 1] = c0 */
2338    #if (uECC_PLATFORM != uECC_arm_thumb2)
2339        ".syntax divided \n\t"
2340    #endif
        /* Outputs: the accumulator and k are read-write; i, tt, t0 and t1 are
           early-clobber scratch because they are written while the input
           pointers below are still needed. */
2341        : [c0] "+r" (c0), [c1] "+r" (c1), [c2] "+r" (c2),
2342          [k] "+r" (k), [i] "=&r" (i), [tt] "=&r" (tt), [t0] "=&r" (t0), [t1] "=&r" (t1)
        /* Inputs: the two pointers, plus the loop bounds as byte-offset
           immediates substituted into the cmp/subs instructions above. */
2343        : [result] "r" (result), [left] "r" (left),
2344          [eccd] "I" (uECC_WORDS * 4), [eccdm1] "I" ((uECC_WORDS-1) * 4),
2345          [eccd2m1] "I" ((uECC_WORDS * 2 - 1) * 4)
2346        : "cc", "memory"
2347    );
2348
2349#else
2350
    /* The assembly below names r0-r7 explicitly, so the arguments are pinned
       to r0-r1. r2-r4 hold c0-c2, r5 holds k, r6 holds i and r7 holds k - i. */
2351    register uint32_t *r0 __asm__("r0") = result;
2352    register const uint32_t *r1 __asm__("r1") = left;
2353
2354    __asm__ volatile (
2355        ".syntax unified \n\t"
2356        "movs r2, #0 \n\t" /* c0 = 0 */
2357        "movs r3, #0 \n\t" /* c1 = 0 */
2358        "movs r4, #0 \n\t" /* c2 = 0 */
2359        "movs r5, #0 \n\t" /* k = 0 */
2360
2361        "push {r0} \n\t" /* keep result on the stack; r0 is reused as scratch below */
2362
2363        "1: \n\t" /* outer loop (k < uECC_WORDS) */
2364        "movs r6, #0 \n\t" /* r6 = i = 0 */
2365        "b 3f \n\t"
2366
2367        "2: \n\t" /* outer loop (k >= uECC_WORDS) */
2368        "movs r6, r5 \n\t"        /* r6 = k */
2369        "subs r6, %[eccdm1] \n\t" /* r6 = i = k - (uECC_WORDS - 1) (times 4) */
2370
2371        "3: \n\t" /* inner loop */
2372        "push {r2, r3, r4, r5} \n\t" /* save c0, c1, c2 and k to free r2-r5 for the multiply; r2 (c0) is at the top of stack. */
2373        "subs r7, r5, r6 \n\t"       /* r7 = k - i */
2374
2375        "ldr r3, [r1, r7] \n\t" /* r3 = left[k - i] */
2376        "ldr r0, [r1, r6] \n\t" /* r0 = left[i] */
2377
        /* Split both operands into 16-bit halves: left[i] = a1:a0,
           left[k - i] = b1:b0. */
2378        "lsrs r2, r0, #16 \n\t" /* r2 = a1 */
2379        "uxth r0, r0 \n\t"      /* r0 = a0 */
2380
2381        "lsrs r4, r3, #16 \n\t" /* r4 = b1 */
2382        "uxth r3, r3 \n\t"      /* r3 = b0 */
2383
2384        "movs r5, r2 \n\t"     /* r5 = a1 */
2385        "muls r5, r4, r5 \n\t" /* r5 = a1 * b1 */
2386        "muls r2, r3, r2 \n\t" /* r2 = b0 * a1 */
2387        "muls r4, r0, r4 \n\t" /* r4 = a0 * b1 */
2388        "muls r0, r3, r0 \n\t" /* r0 = a0 * b0 */
2389
2390        "movs r3, #0 \n\t"  /* r3 = 0 */
2391        "adds r2, r4 \n\t"  /* r2 = b0 * a1 + a0 * b1 */
2392        "adcs r3, r3 \n\t"  /* r3 = carry */
2393        "lsls r3, #16 \n\t" /* r3 = carry << 16 */
2394        "adds r5, r3 \n\t"  /* r5 = a1 * b1 + (carry << 16) */
2395
2396        "lsls r3, r2, #16 \n\t" /* r3 = (b0 * a1 + a0 * b1) << 16 */
2397        "lsrs r2, #16 \n\t"     /* r2 = (b0 * a1 + a0 * b1) >> 16 */
2398        "adds r0, r3 \n\t"      /* r0 = low word = a0 * b0 + ((b0 * a1 + a0 * b1) << 16) */
2399        "adcs r5, r2 \n\t"      /* r5 = high word = a1 * b1 + (carry << 16) + ((b0 * a1 + a0 * b1) >> 16) + carry from low word */
2400
2401        "movs r3, #0 \n\t"  /* r3 = 0 */
2402        "cmp r6, r7 \n\t"   /* (i < k - i) ? */
2403        "mov r7, r3 \n\t"   /* r7 = 0 (does not affect condition)*/
2404        "bge 4f \n\t"       /*   if i >= k - i, skip the doubling (diagonal term) */
2405        "lsls r5, #1 \n\t"  /* high word << 1 */
2406        "adcs r7, r3 \n\t"  /* r7 = carry bit for c2 */
2407        "lsls r0, #1 \n\t"  /* low word << 1 */
2408        "adcs r5, r3 \n\t"  /* add carry from shift to high word */
2409
2410        "4: \n\t"
2411        "pop {r2, r3, r4} \n\t" /* r2 = c0, r3 = c1, r4 = c2 */
2412        "adds r2, r0 \n\t"      /* add low word to c0 */
2413        "adcs r3, r5 \n\t"      /* add high word to c1, including carry */
2414        "movs r0, #0 \n\t"      /* r0 = 0 (does not affect carry bit) */
2415        "adcs r4, r0 \n\t"      /* add carry to c2 */
2416        "adds r4, r7 \n\t"      /* add carry from doubling (if any) */
2417
2418        "pop {r5} \n\t" /* r5 = k */
2419
2420        "adds r6, #4 \n\t"     /* i += 4 */
2421        "cmp r6, r5 \n\t"      /* i < k? */
2422        "bge 5f \n\t"          /*   if not, exit the loop */
2423        "subs r7, r5, r6 \n\t" /* r7 = k - i */
2424        "cmp r6, r7 \n\t"      /* i <= k - i? */
2425        "ble 3b \n\t"          /*   if so, continue looping */
2426
2427        "5: \n\t" /* end inner loop */
2428
2429        "ldr r0, [sp, #0] \n\t" /* r0 = result (reload the copy pushed before the loop) */
2430
2431        "str r2, [r0, r5] \n\t"   /* result[k] = c0 */
2432        "mov r2, r3 \n\t"         /* c0 = c1 */
2433        "mov r3, r4 \n\t"         /* c1 = c2 */
2434        "movs r4, #0 \n\t"        /* c2 = 0 */
2435        "adds r5, #4 \n\t"        /* k += 4 */
2436        "cmp r5, %[eccd] \n\t"    /* k < uECC_WORDS (times 4) ? */
2437        "blt 1b \n\t"             /*   if so, loop back, start with i = 0 */
2438        "cmp r5, %[eccd2m1] \n\t" /* k < uECC_WORDS * 2 - 1 (times 4) ? */
2439        "blt 2b \n\t"             /*   if so, loop back, start with i = (k + 1) - uECC_WORDS */
2440        /* end outer loop */
2441
2442        "str r2, [r0, r5] \n\t" /* result[uECC_WORDS * 2 - 1] = c0 */
2443        "pop {r0} \n\t"        /* pop result off the stack (balances the push before the loop) */
2444
2445        ".syntax divided \n\t"
        /* r0 and r1 are declared read-write; the loop bounds are byte-offset
           immediates substituted into the cmp/subs instructions above. */
2446        : [r0] "+l" (r0), [r1] "+l" (r1)
2447        : [eccd] "I" (uECC_WORDS * 4), [eccdm1] "I" ((uECC_WORDS-1) * 4),
2448          [eccd2m1] "I" ((uECC_WORDS * 2 - 1) * 4)
2449        : "r2", "r3", "r4", "r5", "r6", "r7", "cc", "memory"
2450    );
2451#endif
2452}
2453#define asm_square 1
2454#endif /* !asm_square */
2455#endif /* uECC_SQUARE_FUNC */
2456