cpu: more instruction pipelining

This is needed to gain enough timing margin (WNS, worst negative slack) for fetching instructions from SRAM.
This commit is contained in:
Paul Mathieu 2021-04-17 23:02:58 -07:00
parent 6825ce464f
commit 36bc1417b6
3 changed files with 99 additions and 92 deletions

View File

@ -49,7 +49,7 @@ architecture behavior of cpu is
signal load_reg_next, load_reg: std_logic_vector(15 downto 0);
signal load_addr_next, load_addr: std_logic_vector(15 downto 0);
signal hold_inst_next, hold_inst: std_logic_vector(15 downto 0);
signal inst_next, inst: std_logic_vector(15 downto 0);
type regbank is array(0 to 15) of std_logic_vector(15 downto 0);
signal reg_d: regbank;
@ -62,7 +62,6 @@ begin
load_reg_r: reg port map(clk => clk, rst => rst, d => load_reg_next, q => load_reg);
load_addr_r: reg port map(clk => clk, rst => rst, d => load_addr_next, q => load_addr);
hold_inst_r: reg port map(clk => clk, rst => rst, d => hold_inst_next, q => hold_inst);
allregs:
for i in 0 to 15 generate
@ -73,15 +72,16 @@ begin
begin
if rst = '1' then
cpu_state <= BRANCH; -- wait a cycle at first
inst <= x"0000";
elsif rising_edge(clk) then
cpu_state <= cpu_state_next;
inst <= inst_next;
end if;
end process;
code_addr <= reg_q(14);
process(code_data, reg_q, mem_in, mem_busy, alu_q, alu_flag, cpu_state, load_addr, load_reg, hold_inst) is
variable inst: std_logic_vector(15 downto 0);
process(code_data, reg_q, mem_in, mem_busy, alu_q, alu_flag, cpu_state, load_addr, load_reg, inst) is
variable regn_0: natural;
variable regn_1: natural;
variable regn_2: natural;
@ -110,9 +110,9 @@ begin
case cpu_state is
when RUN =>
reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2);
inst := code_data;
inst_next <= code_data;
when LOAD =>
inst := x"0000"; -- NOP
inst_next <= inst;
mem_addr <= load_addr; -- maintain this until we're done reading
if load_reg(3 downto 0) = x"e" then
cpu_state_next <= BRANCH;
@ -127,93 +127,99 @@ begin
reg_d(regn_0) <= mem_in;
end if;
when BRANCH =>
inst := x"0000"; -- NOP
inst_next <= x"0000"; -- NOP
reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2);
when WAIT_MEM =>
cpu_state_next <= RUN;
inst := hold_inst;
inst_next <= inst;
reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2);
end case;
hold_inst_next <= inst;
regn_0 := to_integer(unsigned(inst(11 downto 8)));
regn_1 := to_integer(unsigned(inst(7 downto 4)));
regn_2 := to_integer(unsigned(inst(3 downto 0)));
case inst(15 downto 12) is
when "0000" => -- NOP
when "0001" => -- LOAD rn, [rm, imm] (imm is signed 4 bits)
if mem_busy = '1' then
reg_d(14) <= reg_q(14); -- halt the prefetcher
cpu_state_next <= WAIT_MEM;
else
mem_read <= '1';
cpu_state_next <= LOAD;
reg_d(14) <= reg_q(14); -- halt the prefetcher
load_addr_next <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
mem_addr <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
load_reg_next(3 downto 0) <= inst(11 downto 8);
end if;
when "0010" => -- STORE rn, [rm, imm]
if mem_busy = '1' then
reg_d(14) <= reg_q(14); -- halt the prefetcher
cpu_state_next <= WAIT_MEM;
else
mem_write <= '1';
mem_addr <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
mem_out <= reg_q(regn_0);
end if;
--- ALU stuff
when "0011" => do_alu := '1'; -- ADD rd, rn, rm (rd := rn + rm)
when "0100" => do_alu := '1'; -- SUB rd, rn, rm (rd := rn - rm)
when "0101" => do_alu := '1'; -- OR rd, rn, rm (rd := rn or rm)
when "0110" => do_alu := '1'; -- AND rd, rn, rm (rd := rn and rm)
when "0111" => do_alu := '1'; -- NOT rd, rn (rd := not rn)
when "1000" => do_alu := '1'; -- XOR rd, rn, rm (rd := rn xor rm)
when "1001" => -- SETH rd, imm
reg_d(regn_0)(15 downto 8) <= inst(7 downto 0);
when "1010" => -- SHR rd, rn, imm (rd := rn >> imm)
alu_sel <= inst(15 downto 12);
alu_a <= reg_q(regn_1);
alu_b <= x"000" & inst(3 downto 0);
reg_d(regn_0) <= alu_q;
when "1011" => do_alu := '1'; -- MUL rd, rn, rm (rd := rn * rm)
when "1100" => -- CMP rn, rm (flag := 1 if equal)
alu_sel <= "1100";
alu_a <= reg_q(regn_0);
alu_b <= reg_q(regn_1);
reg_d(15)(0) <= alu_flag;
when "1101" => -- BEQ imm (jump to [pc, imm] if flag is set, imm is signed 12 bits)
if reg_q(15)(0) = '1' then
reg_d(14) <= std_logic_vector(signed(reg_q(14)) + signed(inst(11 downto 0) & '0'));
cpu_state_next <= BRANCH;
end if;
when "1110" => -- SET rd, imm (rd := imm, imm is 8 bit)
reg_d(regn_0) <= x"00" & inst(7 downto 0);
when "1111" => -- BNEQ imm
if reg_q(15)(0) = '0' then
reg_d(14) <= std_logic_vector(signed(reg_q(14)) + signed(inst(11 downto 0) & '0'));
cpu_state_next <= BRANCH;
end if;
when others => -- do nothing
end case;
if do_alu = '1' then
-- 1:1 mapping
alu_sel <= inst(15 downto 12);
alu_a <= reg_q(regn_1);
alu_b <= reg_q(regn_2);
reg_d(regn_0) <= alu_q;
reg_d(15)(0) <= alu_flag;
if inst(11 downto 8) = x"e" then
cpu_state_next <= BRANCH;
if cpu_state = RUN then
regn_0 := to_integer(unsigned(inst(11 downto 8)));
regn_1 := to_integer(unsigned(inst(7 downto 4)));
regn_2 := to_integer(unsigned(inst(3 downto 0)));
case inst(15 downto 12) is
when "0000" => -- NOP
when "0001" => -- LOAD rn, [rm, imm] (imm is signed 4 bits)
if mem_busy = '1' then
reg_d(14) <= reg_q(14); -- halt the prefetcher
inst_next <= inst;
cpu_state_next <= WAIT_MEM;
else
mem_read <= '1';
cpu_state_next <= LOAD;
reg_d(14) <= reg_q(14); -- halt the prefetcher
-- inst_next <= inst;
load_addr_next <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
mem_addr <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
load_reg_next(3 downto 0) <= inst(11 downto 8);
end if;
when "0010" => -- STORE rn, [rm, imm]
if mem_busy = '1' then
reg_d(14) <= reg_q(14); -- halt the prefetcher
inst_next <= inst;
cpu_state_next <= WAIT_MEM;
else
mem_write <= '1';
mem_addr <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
mem_out <= reg_q(regn_0);
end if;
--- ALU stuff
when "0011" => do_alu := '1'; -- ADD rd, rn, rm (rd := rn + rm)
when "0100" => do_alu := '1'; -- SUB rd, rn, rm (rd := rn - rm)
when "0101" => do_alu := '1'; -- OR rd, rn, rm (rd := rn or rm)
when "0110" => do_alu := '1'; -- AND rd, rn, rm (rd := rn and rm)
when "0111" => do_alu := '1'; -- NOT rd, rn (rd := not rn)
when "1000" => do_alu := '1'; -- XOR rd, rn, rm (rd := rn xor rm)
when "1001" => -- SETH rd, imm
reg_d(regn_0)(15 downto 8) <= inst(7 downto 0);
when "1010" => -- SHR rd, rn, imm (rd := rn >> imm)
alu_sel <= inst(15 downto 12);
alu_a <= reg_q(regn_1);
alu_b <= x"000" & inst(3 downto 0);
reg_d(regn_0) <= alu_q;
when "1011" => do_alu := '1'; -- MUL rd, rn, rm (rd := rn * rm)
when "1100" => -- CMP rn, rm (flag := 1 if equal)
alu_sel <= "1100";
alu_a <= reg_q(regn_0);
alu_b <= reg_q(regn_1);
reg_d(15)(0) <= alu_flag;
when "1101" => -- BEQ imm (jump to [pc, imm] if flag is set, imm is signed 12 bits)
if reg_q(15)(0) = '1' then
reg_d(14) <= std_logic_vector(signed(reg_q(14)) + signed(inst(11 downto 0) & '0'));
cpu_state_next <= BRANCH;
end if;
when "1110" => -- SET rd, imm (rd := imm, imm is 8 bit)
reg_d(regn_0) <= x"00" & inst(7 downto 0);
when "1111" => -- BNEQ imm
if reg_q(15)(0) = '0' then
reg_d(14) <= std_logic_vector(signed(reg_q(14)) + signed(inst(11 downto 0) & '0'));
cpu_state_next <= BRANCH;
end if;
when others => -- do nothing
end case;
if do_alu = '1' then
-- 1:1 mapping
alu_sel <= inst(15 downto 12);
alu_a <= reg_q(regn_1);
alu_b <= reg_q(regn_2);
reg_d(regn_0) <= alu_q;
reg_d(15)(0) <= alu_flag;
if inst(11 downto 8) = x"e" then
cpu_state_next <= BRANCH;
end if;
end if;
end if;
end process;

View File

@ -92,7 +92,7 @@ def generate_ops(ops, labels, relocs):
if isinstance(p, str): # label ref
if len(params) == 1: # branch
yield 14 # pc
yield labels[p] - pc - 2
yield labels[p] - pc - 4
else: # set, allow relocs here
relocs.append((pc, p))
yield 0xff

View File

@ -473,10 +473,10 @@ class ShlOp(BinOp):
return [f'set {sc1}, 1',
f'or {self.dest}, {self.left}, {self.left}',
f'sub {sc0}, {self.right}, {sc1}',
f'beq [pc, 6]',
f'beq [pc, 4]',
f'add {self.dest}, {self.dest}, {self.dest}',
f'sub {sc0}, {sc0}, {sc1}',
f'bneq [pc, -6]']
f'bneq [pc, -8]']
class LtOp(BinOp):
@ -485,7 +485,7 @@ class LtOp(BinOp):
sc0 = scratches[0]
return [f'set {self.dest}, 0',
f'sub {sc0}, {self.left}, {self.right}',
f'bneq [pc, 2]',
f'bneq [pc, 0]',
f'set {self.dest}, 1']
class GtOp(LtOp):
@ -531,7 +531,7 @@ class BoolNot(UnOp):
def synth(self, scratches):
return [f'set {self.dest}, 0',
f'cmp {self.dest}, {self.operand}',
f'bneq [pc, 2]',
f'bneq [pc, 0]',
f'set {self.dest}, 1']
class NeqOp(BinOp):
@ -555,7 +555,7 @@ class FnCall(AsmOp):
sc0 = scratches[0]
fn = self.dest_fn
return out + [f'set {sc0}, 2',
return out + [f'set {sc0}, 0',
f'add lr, pc, {sc0}',
f'or pc, {fn}, {fn}']
@ -1375,10 +1375,11 @@ preamble = [f'_start:',
f'set sp, 0',
f'seth sp, {0x11}', # 256 bytes of stack ought to be enough
f'set r2, main',
f'set r3, 2',
f'set r3, 0',
f'add lr, pc, r3',
f'or pc, r2, r2',
f'or pc, pc, pc // loop forever',
f'cmp r0, r0',
f'beq [pc, -4] // loop forever',
]
def filter_dupes(ops):