cpu: more instruction pipelining

This is needed to leave enough timing margin (worst negative slack, WNS) for fetching instructions from SRAM.
This commit is contained in:
Paul Mathieu 2021-04-17 23:02:58 -07:00
parent 6825ce464f
commit 36bc1417b6
3 changed files with 99 additions and 92 deletions

View File

@ -49,7 +49,7 @@ architecture behavior of cpu is
signal load_reg_next, load_reg: std_logic_vector(15 downto 0); signal load_reg_next, load_reg: std_logic_vector(15 downto 0);
signal load_addr_next, load_addr: std_logic_vector(15 downto 0); signal load_addr_next, load_addr: std_logic_vector(15 downto 0);
signal hold_inst_next, hold_inst: std_logic_vector(15 downto 0); signal inst_next, inst: std_logic_vector(15 downto 0);
type regbank is array(0 to 15) of std_logic_vector(15 downto 0); type regbank is array(0 to 15) of std_logic_vector(15 downto 0);
signal reg_d: regbank; signal reg_d: regbank;
@ -62,7 +62,6 @@ begin
load_reg_r: reg port map(clk => clk, rst => rst, d => load_reg_next, q => load_reg); load_reg_r: reg port map(clk => clk, rst => rst, d => load_reg_next, q => load_reg);
load_addr_r: reg port map(clk => clk, rst => rst, d => load_addr_next, q => load_addr); load_addr_r: reg port map(clk => clk, rst => rst, d => load_addr_next, q => load_addr);
hold_inst_r: reg port map(clk => clk, rst => rst, d => hold_inst_next, q => hold_inst);
allregs: allregs:
for i in 0 to 15 generate for i in 0 to 15 generate
@ -73,15 +72,16 @@ begin
begin begin
if rst = '1' then if rst = '1' then
cpu_state <= BRANCH; -- wait a cycle at first cpu_state <= BRANCH; -- wait a cycle at first
inst <= x"0000";
elsif rising_edge(clk) then elsif rising_edge(clk) then
cpu_state <= cpu_state_next; cpu_state <= cpu_state_next;
inst <= inst_next;
end if; end if;
end process; end process;
code_addr <= reg_q(14); code_addr <= reg_q(14);
process(code_data, reg_q, mem_in, mem_busy, alu_q, alu_flag, cpu_state, load_addr, load_reg, hold_inst) is process(code_data, reg_q, mem_in, mem_busy, alu_q, alu_flag, cpu_state, load_addr, load_reg, inst) is
variable inst: std_logic_vector(15 downto 0);
variable regn_0: natural; variable regn_0: natural;
variable regn_1: natural; variable regn_1: natural;
variable regn_2: natural; variable regn_2: natural;
@ -110,9 +110,9 @@ begin
case cpu_state is case cpu_state is
when RUN => when RUN =>
reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2); reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2);
inst := code_data; inst_next <= code_data;
when LOAD => when LOAD =>
inst := x"0000"; -- NOP inst_next <= inst;
mem_addr <= load_addr; -- maintain this until we're done reading mem_addr <= load_addr; -- maintain this until we're done reading
if load_reg(3 downto 0) = x"e" then if load_reg(3 downto 0) = x"e" then
cpu_state_next <= BRANCH; cpu_state_next <= BRANCH;
@ -127,93 +127,99 @@ begin
reg_d(regn_0) <= mem_in; reg_d(regn_0) <= mem_in;
end if; end if;
when BRANCH => when BRANCH =>
inst := x"0000"; -- NOP inst_next <= x"0000"; -- NOP
reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2); reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2);
when WAIT_MEM => when WAIT_MEM =>
cpu_state_next <= RUN; inst_next <= inst;
inst := hold_inst;
reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2); reg_d(14) <= std_logic_vector(unsigned(reg_q(14)) + 2);
end case;
hold_inst_next <= inst;
regn_0 := to_integer(unsigned(inst(11 downto 8)));
regn_1 := to_integer(unsigned(inst(7 downto 4)));
regn_2 := to_integer(unsigned(inst(3 downto 0)));
case inst(15 downto 12) is
when "0000" => -- NOP
when "0001" => -- LOAD rn, [rm, imm] (imm is signed 4 bits)
if mem_busy = '1' then if mem_busy = '1' then
reg_d(14) <= reg_q(14); -- halt the prefetcher reg_d(14) <= reg_q(14); -- halt the prefetcher
cpu_state_next <= WAIT_MEM; cpu_state_next <= WAIT_MEM;
else
mem_read <= '1';
cpu_state_next <= LOAD;
reg_d(14) <= reg_q(14); -- halt the prefetcher
load_addr_next <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
mem_addr <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
load_reg_next(3 downto 0) <= inst(11 downto 8);
end if; end if;
when "0010" => -- STORE rn, [rm, imm]
if mem_busy = '1' then
reg_d(14) <= reg_q(14); -- halt the prefetcher
cpu_state_next <= WAIT_MEM;
else
mem_write <= '1';
mem_addr <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
mem_out <= reg_q(regn_0);
end if;
--- ALU stuff
when "0011" => do_alu := '1'; -- ADD rd, rn, rm (rd := rn + rm)
when "0100" => do_alu := '1'; -- SUB rd, rn, rm (rd := rn - rm)
when "0101" => do_alu := '1'; -- OR rd, rn, rm (rd := rn or rm)
when "0110" => do_alu := '1'; -- AND rd, rn, rm (rd := rn and rm)
when "0111" => do_alu := '1'; -- NOT rd, rn (rd := not rn)
when "1000" => do_alu := '1'; -- XOR rd, rn, rm (rd := rn xor rm)
when "1001" => -- SETH rd, imm
reg_d(regn_0)(15 downto 8) <= inst(7 downto 0);
when "1010" => -- SHR rd, rn, imm (rd := rn >> imm)
alu_sel <= inst(15 downto 12);
alu_a <= reg_q(regn_1);
alu_b <= x"000" & inst(3 downto 0);
reg_d(regn_0) <= alu_q;
when "1011" => do_alu := '1'; -- MUL rd, rn, rm (rd := rn * rm)
when "1100" => -- CMP rn, rm (flag := 1 if equal)
alu_sel <= "1100";
alu_a <= reg_q(regn_0);
alu_b <= reg_q(regn_1);
reg_d(15)(0) <= alu_flag;
when "1101" => -- BEQ imm (jump to [pc, imm] if flag is set, imm is signed 12 bits)
if reg_q(15)(0) = '1' then
reg_d(14) <= std_logic_vector(signed(reg_q(14)) + signed(inst(11 downto 0) & '0'));
cpu_state_next <= BRANCH;
end if;
when "1110" => -- SET rd, imm (rd := imm, imm is 8 bit)
reg_d(regn_0) <= x"00" & inst(7 downto 0);
when "1111" => -- BNEQ imm
if reg_q(15)(0) = '0' then
reg_d(14) <= std_logic_vector(signed(reg_q(14)) + signed(inst(11 downto 0) & '0'));
cpu_state_next <= BRANCH;
end if;
when others => -- do nothing
end case; end case;
if do_alu = '1' then if cpu_state = RUN then
-- 1:1 mapping regn_0 := to_integer(unsigned(inst(11 downto 8)));
alu_sel <= inst(15 downto 12); regn_1 := to_integer(unsigned(inst(7 downto 4)));
alu_a <= reg_q(regn_1); regn_2 := to_integer(unsigned(inst(3 downto 0)));
alu_b <= reg_q(regn_2);
reg_d(regn_0) <= alu_q; case inst(15 downto 12) is
reg_d(15)(0) <= alu_flag; when "0000" => -- NOP
if inst(11 downto 8) = x"e" then when "0001" => -- LOAD rn, [rm, imm] (imm is signed 4 bits)
cpu_state_next <= BRANCH; if mem_busy = '1' then
reg_d(14) <= reg_q(14); -- halt the prefetcher
inst_next <= inst;
cpu_state_next <= WAIT_MEM;
else
mem_read <= '1';
cpu_state_next <= LOAD;
reg_d(14) <= reg_q(14); -- halt the prefetcher
-- inst_next <= inst;
load_addr_next <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
mem_addr <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
load_reg_next(3 downto 0) <= inst(11 downto 8);
end if;
when "0010" => -- STORE rn, [rm, imm]
if mem_busy = '1' then
reg_d(14) <= reg_q(14); -- halt the prefetcher
inst_next <= inst;
cpu_state_next <= WAIT_MEM;
else
mem_write <= '1';
mem_addr <= std_logic_vector(signed(reg_q(regn_1)) + signed(inst(3 downto 0) & '0'));
mem_out <= reg_q(regn_0);
end if;
--- ALU stuff
when "0011" => do_alu := '1'; -- ADD rd, rn, rm (rd := rn + rm)
when "0100" => do_alu := '1'; -- SUB rd, rn, rm (rd := rn - rm)
when "0101" => do_alu := '1'; -- OR rd, rn, rm (rd := rn or rm)
when "0110" => do_alu := '1'; -- AND rd, rn, rm (rd := rn and rm)
when "0111" => do_alu := '1'; -- NOT rd, rn (rd := not rn)
when "1000" => do_alu := '1'; -- XOR rd, rn, rm (rd := rn xor rm)
when "1001" => -- SETH rd, imm
reg_d(regn_0)(15 downto 8) <= inst(7 downto 0);
when "1010" => -- SHR rd, rn, imm (rd := rn >> imm)
alu_sel <= inst(15 downto 12);
alu_a <= reg_q(regn_1);
alu_b <= x"000" & inst(3 downto 0);
reg_d(regn_0) <= alu_q;
when "1011" => do_alu := '1'; -- MUL rd, rn, rm (rd := rn * rm)
when "1100" => -- CMP rn, rm (flag := 1 if equal)
alu_sel <= "1100";
alu_a <= reg_q(regn_0);
alu_b <= reg_q(regn_1);
reg_d(15)(0) <= alu_flag;
when "1101" => -- BEQ imm (jump to [pc, imm] if flag is set, imm is signed 12 bits)
if reg_q(15)(0) = '1' then
reg_d(14) <= std_logic_vector(signed(reg_q(14)) + signed(inst(11 downto 0) & '0'));
cpu_state_next <= BRANCH;
end if;
when "1110" => -- SET rd, imm (rd := imm, imm is 8 bit)
reg_d(regn_0) <= x"00" & inst(7 downto 0);
when "1111" => -- BNEQ imm
if reg_q(15)(0) = '0' then
reg_d(14) <= std_logic_vector(signed(reg_q(14)) + signed(inst(11 downto 0) & '0'));
cpu_state_next <= BRANCH;
end if;
when others => -- do nothing
end case;
if do_alu = '1' then
-- 1:1 mapping
alu_sel <= inst(15 downto 12);
alu_a <= reg_q(regn_1);
alu_b <= reg_q(regn_2);
reg_d(regn_0) <= alu_q;
reg_d(15)(0) <= alu_flag;
if inst(11 downto 8) = x"e" then
cpu_state_next <= BRANCH;
end if;
end if; end if;
end if; end if;
end process; end process;

View File

@ -92,7 +92,7 @@ def generate_ops(ops, labels, relocs):
if isinstance(p, str): # label ref if isinstance(p, str): # label ref
if len(params) == 1: # branch if len(params) == 1: # branch
yield 14 # pc yield 14 # pc
yield labels[p] - pc - 2 yield labels[p] - pc - 4
else: # set, allow relocs here else: # set, allow relocs here
relocs.append((pc, p)) relocs.append((pc, p))
yield 0xff yield 0xff

View File

@ -473,10 +473,10 @@ class ShlOp(BinOp):
return [f'set {sc1}, 1', return [f'set {sc1}, 1',
f'or {self.dest}, {self.left}, {self.left}', f'or {self.dest}, {self.left}, {self.left}',
f'sub {sc0}, {self.right}, {sc1}', f'sub {sc0}, {self.right}, {sc1}',
f'beq [pc, 6]', f'beq [pc, 4]',
f'add {self.dest}, {self.dest}, {self.dest}', f'add {self.dest}, {self.dest}, {self.dest}',
f'sub {sc0}, {sc0}, {sc1}', f'sub {sc0}, {sc0}, {sc1}',
f'bneq [pc, -6]'] f'bneq [pc, -8]']
class LtOp(BinOp): class LtOp(BinOp):
@ -485,7 +485,7 @@ class LtOp(BinOp):
sc0 = scratches[0] sc0 = scratches[0]
return [f'set {self.dest}, 0', return [f'set {self.dest}, 0',
f'sub {sc0}, {self.left}, {self.right}', f'sub {sc0}, {self.left}, {self.right}',
f'bneq [pc, 2]', f'bneq [pc, 0]',
f'set {self.dest}, 1'] f'set {self.dest}, 1']
class GtOp(LtOp): class GtOp(LtOp):
@ -531,7 +531,7 @@ class BoolNot(UnOp):
def synth(self, scratches): def synth(self, scratches):
return [f'set {self.dest}, 0', return [f'set {self.dest}, 0',
f'cmp {self.dest}, {self.operand}', f'cmp {self.dest}, {self.operand}',
f'bneq [pc, 2]', f'bneq [pc, 0]',
f'set {self.dest}, 1'] f'set {self.dest}, 1']
class NeqOp(BinOp): class NeqOp(BinOp):
@ -555,7 +555,7 @@ class FnCall(AsmOp):
sc0 = scratches[0] sc0 = scratches[0]
fn = self.dest_fn fn = self.dest_fn
return out + [f'set {sc0}, 2', return out + [f'set {sc0}, 0',
f'add lr, pc, {sc0}', f'add lr, pc, {sc0}',
f'or pc, {fn}, {fn}'] f'or pc, {fn}, {fn}']
@ -1375,10 +1375,11 @@ preamble = [f'_start:',
f'set sp, 0', f'set sp, 0',
f'seth sp, {0x11}', # 256 bytes of stack ought to be enough f'seth sp, {0x11}', # 256 bytes of stack ought to be enough
f'set r2, main', f'set r2, main',
f'set r3, 2', f'set r3, 0',
f'add lr, pc, r3', f'add lr, pc, r3',
f'or pc, r2, r2', f'or pc, r2, r2',
f'or pc, pc, pc // loop forever', f'cmp r0, r0',
f'beq [pc, -4] // loop forever',
] ]
def filter_dupes(ops): def filter_dupes(ops):