Listing 1 Simple for loop in C 1 void example1(float *out, float *input1, float *input2) 2 { 3 int i; 4 5 for(i = 0; i < 100; i++) 6 { 7 out[i] = input1[i] * input2[i]; 8 } 9 } Listing 2 Serial assembly language implementation of C loop 1 ; 2 ; serial implementation of loop (26 cycles per iteration) 3 ; 4 L1: LDW *B++,B5 ;load B[i] into B5 5 NOP 4 ; wait for load to complete 6 7 LDW *A++,A4 ; load A[i] into A4 8 NOP 4 ; wait for load to complete 9 10 MPYSP B5,A4,A4 ; A4 = A4 * B5 11 NOP 3 ; wait for mult to complete 12 13 STW A4,*C++ ; store A4 in C[i] 14 NOP 4 ; wait for store to complete 15 16 SUB i,1,i ; decrement i 17 [i] B L1 ; if i != 0, goto L1 18 NOP 5 ; delay for branch Listing 3 A more parallel implementation of the C loop 1 ; using delay slots and duplicate execution units of the device 2 ; 10 cycles per iteration 3 4 L1: LDW .D2 *B++,B5 ;load B[i] into B5 5 || LDW .D1 *A++,A4 ;load A[i] into A4 6 7 NOP 2 ; wait for load to complete 8 SUB .L2 i,1,i ;decrement i 9 [i] B .S1 L1 ; if i != 0, goto L1 10 11 MPYSP .M1X B5,A4,A4 ; A4 = A4 * B5 12 NOP 3 ; wait for mpy to complete 13 14 STW .D1 A4,*C++ ;store A4 into C[i] Listing 4 Loop unrolling: a) A simple loop; b) The same loop unrolled four times a) for (i=0; i<128; i++) { sum1 += const[i] * input[128 - i]; } b) for (i=0; i<128; i+=4) { sum1 += const[i] * input[128 - i]; sum2 += const[i+1] * input[128 - (i+1)]; sum3 += const[i+2] * input[128 - (i+2)]; sum4 += const[i+3] * input[128 - (i+3)]; } Listing 5 C example and the corresponding pipelined assembly language output 1 void example1(float *out, float *input1, float *input2) 2 { 3 int i; 4 5 for(i = 0; i < 100; i++) 6 { 7 out[i] = input1[i] * input2[i]; 8 } 9 } 1 _example1: 2 ;** ---------------------------------------------------------* 3 MVK .S2 0x64,B0 4 5 MVC .S2 CSR,B6 6 || MV .L1X B4,A3 7 || MV .L2X A6,B5 8 AND .L1X -2,B6,A0 9 MVC .S2X A0,CSR 10 ;** ---------------------------------------------------------* 11 L11: ; PIPED LOOP PROLOG 12 ;** 
---------------------------------------------------------* 13 L12: ; PIPED LOOP KERNEL 14 LDW .D2 *B5++,B4 ; 15 || LDW .D1 *A3++,A0 ; 16 NOP 2 17 [ B0] SUB .L2 B0,1,B0 ; 18 [ B0] B .S2 L12 ; 19 MPYSP .M1X B4,A0,A0 ; 20 NOP 3 21 STW .D1 A0,*A4++ ; 22 ;** --------------------------------------------------------* 23 MVC .S2 B6,CSR 24 B .S2 B3 25 NOP 5 26 ; BRANCH OCCURS Listing 6 C example with const-qualified inputs and the corresponding pipelined assembly language output 1 void example2(float *out, const float *input1, const float *input2) 2 { 3 int i; 4 5 for(i = 0; i < 100; i++) 6 { 7 out[i] = input1[i] * input2[i]; 8 } 9 } 1 _example2: 2 ;** ---------------------------------------------------------------* 3 MVK .S2 0x64,B0 4 5 MVC .S2 CSR,B6 6 || MV .L1X B4,A3 7 || MV .L2X A6,B5 8 9 AND .L1X -2,B6,A0 10 11 MVC .S2X A0,CSR 12 || SUB .L2 B0,4,B0 13 14 ;** --------------------------------------------------------------* 15 L8: ; PIPED LOOP PROLOG 16 17 LDW .D2 *B5++,B4 ; 18 || LDW .D1 *A3++,A0 ; 19 20 NOP 1 21 22 LDW .D2 *B5++,B4 ;@ 23 || LDW .D1 *A3++,A0 ;@ 24 25 [ B0] SUB .L2 B0,1,B0 ; 26 27 [ B0] B .S2 L9 ; 28 || LDW .D2 *B5++,B4 ;@@ 29 || LDW .D1 *A3++,A0 ;@@ 30 31 MPYSP .M1X B4,A0,A5 ; 32 || [ B0] SUB .L2 B0,1,B0 ;@ 33 34 [ B0] B .S2 L9 ;@ 35 || LDW .D2 *B5++,B4 ;@@@ 36 || LDW .D1 *A3++,A0 ;@@@ 37 38 MPYSP .M1X B4,A0,A5 ;@ 39 || [ B0] SUB .L2 B0,1,B0 ;@@ 40 41 ;** --------------------------------------------------------------* 42 L9: ; PIPED LOOP KERNEL 43 44 [ B0] B .S2 L9 ;@@ 45 || LDW .D2 *B5++,B4 ;@@@@ 46 || LDW .D1 *A3++,A0 ;@@@@ 47 48 STW .D1 A5,*A4++ ; 49 || MPYSP .M1X B4,A0,A5 ;@@ 50 || [ B0] SUB .L2 B0,1,B0 ;@@@ 51 52 ;** --------------------------------------------------------------* 53 L10: ; PIPED LOOP EPILOG 54 NOP 1 55 56 STW .D1 A5,*A4++ ;@ 57 || MPYSP .M1X B4,A0,A5 ;@@@ 58 59 NOP 1 60 61 STW .D1 A5,*A4++ ;@@ 62 || MPYSP .M1X B4,A0,A5 ;@@@@ 63 64 NOP 1 65 STW .D1 A5,*A4++ ;@@@ 66 NOP 1 67 STW .D1 A5,*A4++ ;@@@@ 68 ;** --------------------------------------------------------------* 69 MVC .S2 
B6,CSR 70 B .S2 B3 71 NOP 5 72 ; BRANCH OCCURS