#!/usr/bin/env python
"""Generate unrolled ARM assembly for a multi-word integer multiplication.

Usage: pass the operand size in 32-bit words as the only argument.  The
generated instructions are written to standard output, one per line, each
wrapped as a C string-literal fragment of the form ``"<insn> \\n\\t"``
(presumably for pasting into a C inline-assembly block -- confirm against
the consumer of this output).

Register roles, as used by the emitted code:
    r0        pointer into the result z
    r1        pointer into operand x
    r2        pointer into operand y
    r3-r5     current window of x words (``rx``)
    r6-r8     current window of y words (``ry``)
    r9-r12, r14   rotating accumulator / scratch registers

The operands are processed as one initial block of ``size % 3`` words (3 if
that remainder is 0) followed by "full rows" that each extend the product by
three more words.

The script runs unchanged on Python 2 and Python 3.
"""

import sys

# Register numbers holding the current three-word window of x and of y.
rx = [3, 4, 5]
ry = [6, 7, 8]

# Starting order of the accumulator registers.  In any rotation, entries
# 0..2 are the running three-word column sum and entries 3..4 receive the
# low/high halves of each product.
_ACC_START = [9, 10, 11, 12, 14]


def format_line(line, *args):
    """Return one instruction wrapped as a C string-literal fragment.

    ``line`` is a %-format template and ``args`` its values.  The suffix is
    a literal backslash-n backslash-t followed by a closing quote.
    """
    return ('"' + line + r' \n\t"') % args


def emit(line, *args):
    """Format one instruction and write it to standard output."""
    sys.stdout.write(format_line(line, *args) + "\n")


def _asm(out, line, *args):
    """Append one formatted instruction to the output list ``out``."""
    out.append(format_line(line, *args))


def _rotate(regs):
    """Return ``regs`` rotated left by one position."""
    return regs[1:] + regs[:1]


def _reg_list(regs, sep=", "):
    """Return register numbers joined as ``r3, r4, r5`` for ldmia/stmia."""
    return sep.join(["r%s" % reg for reg in regs])


def _first_two_columns(out):
    """Emit the first two result columns of a block (fixed registers)."""
    _asm(out, "umull r11, r12, r3, r6")
    _asm(out, "stmia r0!, {r11}")
    out.append("")
    _asm(out, "mov r10, #0")
    _asm(out, "umull r11, r9, r3, r7")
    _asm(out, "adds r12, r11")
    _asm(out, "adc r9, #0")
    _asm(out, "umull r11, r14, r4, r6")
    _asm(out, "adds r12, r11")
    _asm(out, "adcs r9, r14")
    _asm(out, "adc r10, #0")
    _asm(out, "stmia r0!, {r12}")
    out.append("")


def _column(out, acc, pairs, add_stored=False):
    """Emit one result column and return the rotated accumulator list.

    ``pairs`` is a sequence of (x register, y register) numbers whose
    products are summed into the accumulator.  With ``add_stored`` the word
    already stored at [r0] by an earlier block is added in as well.  The
    lowest accumulator word is then stored through r0.
    """
    _asm(out, "mov r%s, #0", acc[2])
    for x_reg, y_reg in pairs:
        _asm(out, "umull r%s, r%s, r%s, r%s", acc[3], acc[4], x_reg, y_reg)
        _asm(out, "adds r%s, r%s", acc[0], acc[3])
        _asm(out, "adcs r%s, r%s", acc[1], acc[4])
        _asm(out, "adc r%s, #0", acc[2])
    if add_stored:
        # load stored value from an earlier block, and add to accumulator
        _asm(out, "ldr r%s, [r0]", acc[3])
        _asm(out, "adds r%s, r%s", acc[0], acc[3])
        _asm(out, "adcs r%s, #0", acc[1])
        _asm(out, "adc r%s, #0", acc[2])
    _asm(out, "stmia r0!, {r%s}", acc[0])
    out.append("")
    # The stored register becomes scratch; the next two become the new sum.
    return _rotate(acc)


def _last_column(out, acc, x_reg, y_reg):
    """Emit the final single-product column and store the top two words."""
    _asm(out, "umull r%s, r%s, r%s, r%s", acc[3], acc[4], x_reg, y_reg)
    _asm(out, "adds r%s, r%s", acc[0], acc[3])
    _asm(out, "adc r%s, r%s", acc[1], acc[4])
    _asm(out, "stmia r0!, {r%s}", acc[0])
    _asm(out, "stmia r0!, {r%s}", acc[1])


def _initial_block(out, init_size):
    """Emit the init_size x init_size product of the loaded words."""
    if init_size == 1:
        _asm(out, "umull r9, r10, r3, r6")
        _asm(out, "stmia r0!, {r9, r10}")
        return

    #### first two multiplications of initial block
    _first_two_columns(out)

    #### rest of initial block, with moving accumulator registers
    acc = list(_ACC_START)
    if init_size == 3:
        acc = _column(out, acc, [(rx[i], ry[2 - i]) for i in range(3)])
        acc = _column(out, acc, [(rx[i + 1], ry[2 - i]) for i in range(2)])
    _last_column(out, acc, rx[init_size - 1], ry[init_size - 1])


def _full_row(out, prev_size):
    """Emit one full row, extending a prev_size-word product by three words."""
    _first_two_columns(out)
    acc = list(_ACC_START)
    acc = _column(out, acc, [(rx[i], ry[2 - i]) for i in range(3)])

    #### now we need to start shifting x and loading from z
    x_regs = list(rx)
    for _ in range(prev_size):
        x_regs = _rotate(x_regs)
        _asm(out, "ldmia r1!, {r%s}", x_regs[2])
        acc = _column(out, acc,
                      [(x_regs[i], ry[2 - i]) for i in range(3)],
                      add_stored=True)

    # done shifting x, start shifting y
    y_regs = list(ry)
    for _ in range(prev_size):
        y_regs = _rotate(y_regs)
        _asm(out, "ldmia r2!, {r%s}", y_regs[2])
        acc = _column(out, acc,
                      [(x_regs[i], y_regs[2 - i]) for i in range(3)],
                      add_stored=True)

    # done both shifts, do remaining corner
    acc = _column(out, acc,
                  [(x_regs[i + 1], y_regs[2 - i]) for i in range(2)])
    _last_column(out, acc, x_regs[2], y_regs[2])
    out.append("")


def generate(size):
    """Return the generated output lines for a ``size``-word multiplication.

    Args:
        size: operand length in 32-bit words; must be at least 1.

    Returns:
        A list of strings, one per output line.  Empty strings are the
        blank separator lines.

    Raises:
        ValueError: if ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive number of 32-bit words")

    full_rows, init_size = divmod(size, 3)
    if init_size == 0:
        # A multiple of three starts with a full 3-word initial block.
        full_rows -= 1
        init_size = 3

    out = []

    #### set up registers
    _asm(out, "add r0, %s", (size - init_size) * 4)  # move z
    _asm(out, "add r2, %s", (size - init_size) * 4)  # move y

    _asm(out, "ldmia r1!, {%s}", _reg_list(rx[:init_size]))
    _asm(out, "ldmia r2!, {%s}", _reg_list(ry[:init_size]))
    out.append("")

    _initial_block(out, init_size)
    # NOTE(review): this separator is assumed to follow both branches of the
    # initial block -- confirm against previously generated output.
    out.append("")

    #### reset y and z pointers
    # NOTE(review): this reset/reload is emitted even when no full row
    # follows (size <= 3), as in the original script -- confirm that is
    # intended before relying on such sizes.
    _asm(out, "sub r0, %s", (2 * init_size + 3) * 4)
    _asm(out, "sub r2, %s", (init_size + 3) * 4)

    #### load y registers
    _asm(out, "ldmia r2!, {%s}", _reg_list(ry))

    #### load additional x registers
    if init_size != 3:
        _asm(out, "ldmia r1!, {%s}", _reg_list(rx[init_size:]))
    out.append("")

    prev_size = init_size
    for row in range(full_rows):
        _full_row(out, prev_size)
        prev_size += 3
        if row < full_rows - 1:
            #### reset x, y and z pointers
            _asm(out, "sub r0, %s", (2 * prev_size + 3) * 4)
            _asm(out, "sub r1, %s", prev_size * 4)
            _asm(out, "sub r2, %s", (prev_size + 3) * 4)

            #### load x and y registers
            # These two lists are joined without a space after the comma,
            # exactly as the original script emitted them.
            _asm(out, "ldmia r1!, {%s}", _reg_list(rx, sep=","))
            _asm(out, "ldmia r2!, {%s}", _reg_list(ry, sep=","))
            out.append("")

    return out


def main(argv=None):
    """Run the generator from the command line and return the exit status.

    Args:
        argv: argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 1 if the size argument is missing or invalid.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        sys.stdout.write("Provide the integer size in 32-bit words\n")
        return 1

    try:
        lines = generate(int(argv[1]))
    except ValueError:
        # Reported on stderr so it can never be mistaken for generated code.
        sys.stderr.write(
            "Integer size must be a positive number of 32-bit words\n")
        return 1

    for line in lines:
        sys.stdout.write(line + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())