cpu: more instruction pipelining

This is needed to leave enough timing slack (WNS, worst negative slack) for fetching instructions from SRAM.
This commit is contained in:
Paul Mathieu 2021-04-17 23:02:58 -07:00
parent 6825ce464f
commit 36bc1417b6
3 changed files with 99 additions and 92 deletions

View File

@ -49,7 +49,7 @@ architecture behavior of cpu is
signal load_reg_next, load_reg: std_logic_vector(15 downto 0); signal load_reg_next, load_reg: std_logic_vector(15 downto 0);
signal load_addr_next, load_addr: std_logic_vector(15 downto 0); signal load_addr_next, load_addr: std_logic_vector(15 downto 0);
signal hold_inst_next, hold_inst: std_logic_vector(15 downto 0); signal inst_next, inst: std_logic_vector(15 downto 0);
type regbank is array(0 to 15) of std_logic_vector(15 downto 0); type regbank is array(0 to 15) of std_logic_vector(15 downto 0);
signal reg_d: regbank; signal reg_d: regbank;
@ -62,7 +62,6 @@ begin
load_reg_r: reg port map(clk => clk, rst => rst, d => load_reg_next, q => load_reg); load_reg_r: reg port map(clk => clk, rst => rst, d => load_reg_next, q => load_reg);
load_addr_r: reg port map(clk => clk, rst => rst, d => load_addr_next, q => load_addr); load_addr_r: reg port map(clk => clk, rst => rst, d => load_addr_next, q => load_addr);
hold_inst_r: reg port map(clk => clk, rst => rst, d => hold_inst_next, q => hold_inst);
allregs: allregs:
for i in 0 to 15 generate for i in 0 to 15 generate
@ -73,15 +72,16 @@ begin
begin begin
if rst = '1' then if rst = '1' then
cpu_state <= BRANCH; -- wait a cycle at first cpu_state <= BRANCH; -- wait a cycle at first
inst <= x"0000";
elsif rising_edge(clk) then elsif rising_edge(clk) then
cpu_state <= cpu_state_next; cpu_state <= cpu_state_next;
inst <= inst_next;
end if; end if;
end process; end process;
code_addr <= reg_q(14); code_addr <= reg_q(14);
process(code_data, reg_q, mem_in, mem_busy, alu_q, alu_flag, cpu_state, load_addr, load_reg, hold_inst) is process(code_data, reg_q, mem_in, mem_busy, alu_q, alu_flag, cpu_state, load_addr, load_reg, inst) is
variable inst: std_logic_vector(15 downto 0);
variable regn_0: natural; variable regn_0: natural;
variable regn_1: natural; variable regn_1: natural;
variable regn_2: natural; variable regn_2: natural;
@ -110,9 +110,9 @@ begin
case cpu_state is case cpu_state is
when RUN => when RUN =>
reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2); reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2);
inst := code_data; inst_next <= code_data;
when LOAD => when LOAD =>
inst := x"0000"; -- NOP inst_next <= inst;
mem_addr <= load_addr; -- maintain this until we're done reading mem_addr <= load_addr; -- maintain this until we're done reading
if load_reg(3 downto 0) = x"e" then if load_reg(3 downto 0) = x"e" then
cpu_state_next <= BRANCH; cpu_state_next <= BRANCH;
@ -127,16 +127,18 @@ begin
reg_d(regn_0) <= mem_in; reg_d(regn_0) <= mem_in;
end if; end if;
when BRANCH => when BRANCH =>
inst := x"0000"; -- NOP inst_next <= x"0000"; -- NOP
reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2); reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2);
when WAIT_MEM => when WAIT_MEM =>
cpu_state_next <= RUN; inst_next <= inst;
inst := hold_inst;
reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2); reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2);
if mem_busy = '1' then
reg_d(14) <= reg_q(14); -- halt the prefetcher
cpu_state_next <= WAIT_MEM;
end if;
end case; end case;
hold_inst_next <= inst; if cpu_state = RUN then
regn_0 := to_integer(unsigned(inst(11 downto 8))); regn_0 := to_integer(unsigned(inst(11 downto 8)));
regn_1 := to_integer(unsigned(inst(7 downto 4))); regn_1 := to_integer(unsigned(inst(7 downto 4)));
regn_2 := to_integer(unsigned(inst(3 downto 0))); regn_2 := to_integer(unsigned(inst(3 downto 0)));
@ -146,11 +148,13 @@ begin
when "0001" => -- LOAD rn, [rm, imm] (imm is signed 4 bits) when "0001" => -- LOAD rn, [rm, imm] (imm is signed 4 bits)
if mem_busy = '1' then if mem_busy = '1' then
reg_d(14) <= reg_q(14); -- halt the prefetcher reg_d(14) <= reg_q(14); -- halt the prefetcher
inst_next <= inst;
cpu_state_next <= WAIT_MEM; cpu_state_next <= WAIT_MEM;
else else
mem_read <= '1'; mem_read <= '1';
cpu_state_next <= LOAD; cpu_state_next <= LOAD;
reg_d(14) <= reg_q(14); -- halt the prefetcher reg_d(14) <= reg_q(14); -- halt the prefetcher
-- inst_next <= inst;
load_addr_next <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0')); load_addr_next <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
mem_addr <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0')); mem_addr <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
@ -160,6 +164,7 @@ begin
when "0010" => -- STORE rn, [rm, imm] when "0010" => -- STORE rn, [rm, imm]
if mem_busy = '1' then if mem_busy = '1' then
reg_d(14) <= reg_q(14); -- halt the prefetcher reg_d(14) <= reg_q(14); -- halt the prefetcher
inst_next <= inst;
cpu_state_next <= WAIT_MEM; cpu_state_next <= WAIT_MEM;
else else
mem_write <= '1'; mem_write <= '1';
@ -216,6 +221,7 @@ begin
cpu_state_next <= BRANCH; cpu_state_next <= BRANCH;
end if; end if;
end if; end if;
end if;
end process; end process;
end behavior; end behavior;

View File

@ -92,7 +92,7 @@ def generate_ops(ops, labels, relocs):
if isinstance(p, str): # label ref if isinstance(p, str): # label ref
if len(params) == 1: # branch if len(params) == 1: # branch
yield 14 # pc yield 14 # pc
yield labels[p] - pc - 2 yield labels[p] - pc - 4
else: # set, allow relocs here else: # set, allow relocs here
relocs.append((pc, p)) relocs.append((pc, p))
yield 0xff yield 0xff

View File

@ -473,10 +473,10 @@ class ShlOp(BinOp):
return [f'set {sc1}, 1', return [f'set {sc1}, 1',
f'or {self.dest}, {self.left}, {self.left}', f'or {self.dest}, {self.left}, {self.left}',
f'sub {sc0}, {self.right}, {sc1}', f'sub {sc0}, {self.right}, {sc1}',
f'beq [pc, 6]', f'beq [pc, 4]',
f'add {self.dest}, {self.dest}, {self.dest}', f'add {self.dest}, {self.dest}, {self.dest}',
f'sub {sc0}, {sc0}, {sc1}', f'sub {sc0}, {sc0}, {sc1}',
f'bneq [pc, -6]'] f'bneq [pc, -8]']
class LtOp(BinOp): class LtOp(BinOp):
@ -485,7 +485,7 @@ class LtOp(BinOp):
sc0 = scratches[0] sc0 = scratches[0]
return [f'set {self.dest}, 0', return [f'set {self.dest}, 0',
f'sub {sc0}, {self.left}, {self.right}', f'sub {sc0}, {self.left}, {self.right}',
f'bneq [pc, 2]', f'bneq [pc, 0]',
f'set {self.dest}, 1'] f'set {self.dest}, 1']
class GtOp(LtOp): class GtOp(LtOp):
@ -531,7 +531,7 @@ class BoolNot(UnOp):
def synth(self, scratches): def synth(self, scratches):
return [f'set {self.dest}, 0', return [f'set {self.dest}, 0',
f'cmp {self.dest}, {self.operand}', f'cmp {self.dest}, {self.operand}',
f'bneq [pc, 2]', f'bneq [pc, 0]',
f'set {self.dest}, 1'] f'set {self.dest}, 1']
class NeqOp(BinOp): class NeqOp(BinOp):
@ -555,7 +555,7 @@ class FnCall(AsmOp):
sc0 = scratches[0] sc0 = scratches[0]
fn = self.dest_fn fn = self.dest_fn
return out + [f'set {sc0}, 2', return out + [f'set {sc0}, 0',
f'add lr, pc, {sc0}', f'add lr, pc, {sc0}',
f'or pc, {fn}, {fn}'] f'or pc, {fn}, {fn}']
@ -1375,10 +1375,11 @@ preamble = [f'_start:',
f'set sp, 0', f'set sp, 0',
f'seth sp, {0x11}', # 256 bytes of stack ought to be enough f'seth sp, {0x11}', # 256 bytes of stack ought to be enough
f'set r2, main', f'set r2, main',
f'set r3, 2', f'set r3, 0',
f'add lr, pc, r3', f'add lr, pc, r3',
f'or pc, r2, r2', f'or pc, r2, r2',
f'or pc, pc, pc // loop forever', f'cmp r0, r0',
f'beq [pc, -4] // loop forever',
] ]
def filter_dupes(ops): def filter_dupes(ops):