@******************************************************************************
@ Assembly functions for testing the cycle counts of read/write operations
@ Created by Brighton Feng for Cortex A15 on July 31, 2014
@******************************************************************************
 	.global Asm_LDRD_Test
 	.global Asm_STRD_Test
 	.global AsmMemCopy

	.text
	@ This code is assembled for ARM instructions
	.code 32

@------------------------------------------------------------------------------
@ Asm_LDRD_Test(buff_addr, uiIndex, uiCount)
@ Issues LDRD (64-bit) loads while walking through a buffer. The loaded data
@ is discarded. A DSB is executed after the last load before returning.
@
@ In:   r0 = buff_addr  address of the first load
@       r1 = uiIndex    distance between consecutive loads. It is added to the
@                       address as-is, i.e. it is a byte offset.
@       r2 = uiCount    number of loads. Four loads are issued per loop pass,
@                       so the total is rounded up to a multiple of 4.
@                       A count of 0 issues no load at all.
@ Regs: r0-r3 and the flags are modified, r4-r7 are saved and restored.
@ NOTE: the loop is closed by a signed test (BGT), so a count of 0x80000000
@       or more performs one pass only.
@------------------------------------------------------------------------------
    .type    Asm_LDRD_Test, %function
Asm_LDRD_Test:
    PUSH    {r4-r7}                 @ r4-r7 are callee-saved, 16 bytes keeps sp 8-byte aligned
    ADD     r3, r1, r1              @ r3 = uiIndex*2
    ADD     r4, r3, r1              @ r4 = uiIndex*3
    ADD     r5, r4, r1              @ r5 = uiIndex*4, the address step per loop pass
    CMP     r2, #0                  @ nothing requested: do not touch the buffer
    BEQ     .LLoadTestDone
.LLoadTestLoop:
    SUBS    r2, r2, #4              @ four loads per pass, flags are consumed by BGT below
    LDRD    r6, r7, [r0, #+0]
    LDRD    r6, r7, [r0, +r1]
    LDRD    r6, r7, [r0, +r3]
    LDRD    r6, r7, [r0, +r4]
    ADD     r0, r0, r5
    BGT     .LLoadTestLoop

.LLoadTestDone:
    DSB                             @ wait until all outstanding accesses have completed

    POP     {r4-r7}
    BX      lr
    .size    Asm_LDRD_Test, . - Asm_LDRD_Test
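/* Disabled alternative body for Asm_LDRD_Test: it adds the low word (r6) of every
   loaded doubleword into r8/r9 and returns r8+r9 in r0.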
    PUSH    {r4-r9}
	ADD     r3, r1, r1		@uiIndex*2
	ADD     r4, r3, r1 		@uiIndex*3
	ADD     r5, r4, r1 		@uiIndex*4
	MOV     r8, #0
	MOV     r9, #0
LoadTestLoop:
	SUBS 	r2, r2, #4
	LDRD 	r6, r7, [r0, #+0]
	ADD     r8, r8, r6
	LDRD 	r6, r7, [r0, +r1]
	ADD     r9, r9, r6
	LDRD 	r6, r7, [r0, +r3]
	ADD     r8, r8, r6
	LDRD 	r6, r7, [r0, +r4]
	ADD     r9, r9, r6
	ADD     r0, r0, r5
	BGT 	LoadTestLoop
 		
	DSB

	ADD     r0, r8, r9
	
    POP     {r4-r9}
    BX      lr
*/
@------------------------------------------------------------------------------
@ Asm_STRD_Test(buff_addr, uiIndex, uiCount)
@ Issues STRD (64-bit) stores while walking through a buffer. Every store
@ writes the pair r6/r7, both set to 0xBF. A DSB is executed after the last
@ store before returning.
@
@ In:   r0 = buff_addr  address of the first store
@       r1 = uiIndex    distance between consecutive stores. It is added to the
@                       address as-is, i.e. it is a byte offset.
@       r2 = uiCount    number of stores. Four stores are issued per loop pass,
@                       so the total is rounded up to a multiple of 4.
@                       A count of 0 writes nothing.
@ Regs: r0-r3 and the flags are modified, r4-r7 are saved and restored.
@ NOTE: the loop is closed by a signed test (BGT), so a count of 0x80000000
@       or more performs one pass only.
@------------------------------------------------------------------------------
    .type    Asm_STRD_Test, %function
Asm_STRD_Test:
    PUSH    {r4-r7}                 @ r4-r7 are callee-saved, 16 bytes keeps sp 8-byte aligned
    MOV     r6, #0xBF               @ data pattern, first word of each doubleword
    MOV     r7, #0xBF               @ data pattern, second word of each doubleword
    ADD     r3, r1, r1              @ r3 = uiIndex*2
    ADD     r4, r3, r1              @ r4 = uiIndex*3
    ADD     r5, r4, r1              @ r5 = uiIndex*4, the address step per loop pass
    CMP     r2, #0                  @ nothing requested: leave the buffer untouched
    BEQ     .LStoreTestDone
.LStoreTestLoop:
    SUBS    r2, r2, #4              @ four stores per pass, flags are consumed by BGT below
    STRD    r6, r7, [r0, #+0]
    STRD    r6, r7, [r0, +r1]
    STRD    r6, r7, [r0, +r3]
    STRD    r6, r7, [r0, +r4]
    ADD     r0, r0, r5
    BGT     .LStoreTestLoop

.LStoreTestDone:
    DSB                             @ wait until all outstanding accesses have completed

    POP     {r4-r7}
    BX      lr
    .size    Asm_STRD_Test, . - Asm_STRD_Test

@------------------------------------------------------------------------------
@ Copy a multiple of 8 bytes of data to show the maximum throughput of data
@ transfer by the CPU.
@ void AsmMemCopy(unsigned long long * restrict dst,
@                 unsigned long long * restrict src, Uint32 uiCount)
@
@ In:   r0 = dst      destination address
@       r1 = src      source address
@       r2 = uiCount  number of bytes to copy, treated as unsigned.
@                     Expected to be a multiple of 8. Any other value is
@                     rounded up to the next multiple of 8 because data moves
@                     in LDRD/STRD doublewords. A count of 0 copies nothing.
@ Regs: r0-r2 and the flags are modified, r4-r5 are saved and restored.
@
@ Full 64-byte chunks go through the unrolled loop, which prefetches 256
@ bytes ahead of the source pointer. The remaining 0..56 bytes are copied
@ one doubleword at a time, so the copy never runs past uiCount bytes.
@------------------------------------------------------------------------------
    .type    AsmMemCopy, %function
AsmMemCopy:
    PUSH    {r4-r5}                 @ r4/r5 carry the doubleword in flight
    SUBS    r2, r2, #64             @ bias the counter: r2 = bytes left - 64
    BLO     .LMemCopyTail           @ fewer than 64 bytes requested: no full chunk
.LMemCopyLoop:
    PLD     [r1, #256]
    SUBS    r2, r2, #64             @ carry stays set while another full chunk follows this one
    LDRD    r4, r5, [r1, #0]
    STRD    r4, r5, [r0, #0]
    LDRD    r4, r5, [r1, #8]
    STRD    r4, r5, [r0, #8]
    LDRD    r4, r5, [r1, #16]
    STRD    r4, r5, [r0, #16]
    LDRD    r4, r5, [r1, #24]
    STRD    r4, r5, [r0, #24]
    LDRD    r4, r5, [r1, #32]
    STRD    r4, r5, [r0, #32]
    LDRD    r4, r5, [r1, #40]
    STRD    r4, r5, [r0, #40]
    LDRD    r4, r5, [r1, #48]
    STRD    r4, r5, [r0, #48]
    LDRD    r4, r5, [r1, #56]
    STRD    r4, r5, [r0, #56]
    ADD     r1, r1, #64
    ADD     r0, r0, #64
    BHS     .LMemCopyLoop           @ flags still come from the SUBS at the top of the loop

.LMemCopyTail:
    ADDS    r2, r2, #64             @ undo the bias: r2 = bytes left, 0..63
    BEQ     .LMemCopyDone
.LMemCopyTailLoop:
    LDRD    r4, r5, [r1], #8        @ one doubleword per pass, pointers post-incremented
    STRD    r4, r5, [r0], #8
    SUBS    r2, r2, #8
    BHI     .LMemCopyTailLoop       @ continue while bytes remain (unsigned)

.LMemCopyDone:
    POP     {r4-r5}

    BX      lr
    .size    AsmMemCopy, . - AsmMemCopy
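/* Disabled alternative body for AsmMemCopy that moves data with VLDR/VSTR through D16/D17.
   NOTE(review): it runs (R2 >> 4) passes of 128 bytes each, i.e. it treats uiCount as a
   number of 8-byte elements, whereas the active code counts bytes. Confirm the intended
   unit before re-enabling.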
  MOVS            R2, R2, LSR #4   
  BXEQ            R14              
  ADD             R1, R1, #128     
  ADD             R0, R0, #128     
  MOV             R3, #0           
MemCopySIMDloop:
  ADD             R3, R3, #1       
  VLDR            D16, [R1, #-112] 
  VLDR            D17, [R1, #-104] 
  VSTR            D16, [R0, #-112] 
  VSTR            D17, [R0, #-104] 
  CMP             R3, R2           
  VLDR            D16, [R1, #-128] 
  VLDR            D17, [R1, #-120] 
  VSTR            D16, [R0, #-128] 
  VSTR            D17, [R0, #-120] 
  VLDR            D16, [R1, #-96]  
  VLDR            D17, [R1, #-88]  
  VSTR            D16, [R0, #-96]  
  VSTR            D17, [R0, #-88]  
  VLDR            D16, [R1, #-80]  
  VLDR            D17, [R1, #-72]  
  VSTR            D16, [R0, #-80]  
  VSTR            D17, [R0, #-72]  
  VLDR            D16, [R1, #-64]  
  VLDR            D17, [R1, #-56]  
  VSTR            D16, [R0, #-64]  
  VSTR            D17, [R0, #-56]  
  VLDR            D16, [R1, #-48]  
  VLDR            D17, [R1, #-40]  
  VSTR            D16, [R0, #-48]  
  VSTR            D17, [R0, #-40]  
  VLDR            D16, [R1, #-32]  
  VLDR            D17, [R1, #-24]  
  VSTR            D16, [R0, #-32]  
  VSTR            D17, [R0, #-24]  
  VLDR            D16, [R1, #-16]  
  VLDR            D17, [R1, #-8]   
  ADD             R1, R1, #128     
  VSTR            D16, [R0, #-16]  
  VSTR            D17, [R0, #-8]   
  ADD             R0, R0, #128     
  BNE             MemCopySIMDloop        
  BX              R14              
*/	
