Figure 4-25. Nested Loops: Touch Instruction Example
|
|
Fortran Source Code
|
|
|
! Nested-loop example for the touch-instruction figure: each c(i,j)
! is formed from elements of c, a and b taken one, two and three
! columns ahead of column j.
      program main
      integer i,j,n
      real*4 a(1000,1000), b(1000,1000)
      real*4 c(1000,1000)
      n=1000
! The body reads up to column j+3, so the outer loop stops at n-3
! to keep every subscript inside the declared 1000 columns.
      do 20 j=1,n-3
! Inner loop runs down one column (first subscript varies).
         do 10 i = 1,n
            c(i,j) = c(i,j+1) + a(i,j+2) + b(i,j+3)
   10    continue
   20 continue
      end
|
|
|
|
|
|
|
|
Assembly Code for Innermost Loop Body
|
|
|
# index for touching a is (R31) = 100
# index for touching b is (R4) = 100
# index for touching the next column in c is (R3) = 4100
CL.2:
        lfsu    FR2,4(R5)        # load b(i,j+3) copy 1
        fadds   FR0,FR1,FR0      # tmp = c(i,j+1) + a(i,j+2) copy 1
        lfsu    FR3,4(R29)       # load a(i,j+2) copy 2
        lfs     FR4,4008(R30)    # load c(i,j+1) copy 2
        lfsu    FR5,4(R5)        # load b(i,j+3) copy 2
        fadds   FR0,FR0,FR2      # c(i,j) = tmp + b(i,j+3) copy 1
        stfsu   FR0,4(R30)       # store c(i,j) copy 1
        dcbt    R30,R3           # touch c
        fadds   FR1,FR4,FR3      # tmp = c(i,j+1) + a(i,j+2) copy 2
        lfs     FR6,4008(R30)    # load c(i,j+1) copy 3
        lfsu    FR2,4(R29)       # load a(i,j+2) copy 3
        lfsu    FR4,4(R29)       # load a(i,j+2) copy 4
        fadds   FR0,FR1,FR5      # c(i,j) = tmp + b(i,j+3) copy 2
        stfsu   FR0,4(R30)       # store c(i,j) copy 2
        lfs     FR3,4008(R30)    # load c(i,j+1) copy 4
        lfsu    FR0,4(R5)        # load b(i,j+3) copy 3
        fadds   FR1,FR6,FR2      # tmp = c(i,j+1) + a(i,j+2) copy 3
        lfsu    FR5,4(R5)        # load b(i,j+3) copy 4
        lfsu    FR2,4(R29)       # load a(i,j+2) copy 5
        fadds   FR0,FR1,FR0      # c(i,j) = tmp + b(i,j+3) copy 3
        stfsu   FR0,4(R30)       # store c(i,j) copy 3
        lfs     FR6,4008(R30)    # load c(i,j+1) copy 5
        fadds   FR1,FR3,FR4      # tmp = c(i,j+1) + a(i,j+2) copy 4
        lfsu    FR4,4(R29)       # load a(i,j+2) copy 6
        fadds   FR0,FR1,FR5      # c(i,j) = tmp + b(i,j+3) copy 4
        stfsu   FR0,4(R30)       # store c(i,j) copy 4
        lfs     FR3,4008(R30)    # load c(i,j+1) copy 6
        lfsu    FR0,4(R5)        # load b(i,j+3) copy 5
        fadds   FR1,FR6,FR2      # tmp = c(i,j+1) + a(i,j+2) copy 5
        dcbt    R5,R4            # touch b
        lfsu    FR5,4(R5)        # load b(i,j+3) copy 6
        lfsu    FR2,4(R29)       # load a(i,j+2) copy 7
        fadds   FR0,FR1,FR0      # c(i,j) = tmp + b(i,j+3) copy 5
        stfsu   FR0,4(R30)       # store c(i,j) copy 5
        lfs     FR6,4008(R30)    # load c(i,j+1) copy 7
        fadds   FR1,FR3,FR4      # tmp = c(i,j+1) + a(i,j+2) copy 6
        lfsu    FR4,4(R29)       # load a(i,j+2) copy 8
        fadds   FR0,FR1,FR5      # c(i,j) = tmp + b(i,j+3) copy 6
        stfsu   FR0,4(R30)       # store c(i,j) copy 6
        lfs     FR3,4008(R30)    # load c(i,j+1) copy 8
        dcbt    R29,R31          # touch a
        lfsu    FR0,4(R5)        # load b(i,j+3) copy 7
        fadds   FR1,FR6,FR2      # tmp = c(i,j+1) + a(i,j+2) copy 7
        lfsu    FR5,4(R5)        # load b(i,j+3) copy 8
        fadds   FR2,FR3,FR4      # tmp = c(i,j+1) + a(i,j+2) copy 8
        fadds   FR1,FR1,FR0      # c(i,j) = tmp + b(i,j+3) copy 7
        stfsu   FR1,4(R30)       # store c(i,j) copy 7
        fadds   FR2,FR2,FR5      # c(i,j) = tmp + b(i,j+3) copy 8
        lfsu    FR0,4(R29)       # load a(i,j+2) copy 1 (next iteration)
        lfs     FR1,4008(R30)    # load c(i,j+1) copy 1 (next iteration)
        stfsu   FR2,4(R30)       # store c(i,j) copy 8
        bdnz    CL.2             # latch to CL.2
CL.40:
|
|
|
|