library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

-- UART with a small memory-mapped register interface.
--
-- All processes in the architecture are clocked on the rising edge of clk
-- and are reset asynchronously while rst = '1'.
entity uart is
  generic (
    -- Line rate in baud. The architecture derives its bit-timing dividers
    -- from this value and a fixed system frequency constant.
    baudrate : in natural := 1_000_000
  );
  port (
    clk     : in std_logic;   -- system clock (rising-edge active)
    rst     : in std_logic;   -- asynchronous reset, active high

    -- serial line
    rx_pin  : in std_logic;   -- serial data input
    tx_pin  : out std_logic;  -- serial data output, driven '1' when idle

    -- bus interface
    we      : in std_logic;                       -- write strobe (takes priority over re)
    re      : in std_logic;                       -- read strobe
    addr    : in std_logic_vector(15 downto 0);   -- register address (x"0000" data, x"0002" flags)
    din     : in std_logic_vector(15 downto 0);   -- write data; only bits 7..0 are used
    dout    : out std_logic_vector(15 downto 0)   -- registered read data, x"0000" when nothing is selected
  );
end entity uart;

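--
-- Memory map (bus addresses):
--  0x0000: data register, 8 bits wide, read/write
--          (write: queue a byte for transmission;
--           read:  fetch the oldest received byte)
--  0x0002: status flags: bit 1 = rxne (Rx FIFO not empty),
--                        bit 0 = txe  (Tx FIFO empty)

-- Rx FIFO 4 bytes long
-- Tx FIFO 4 bytes long

-- Mnemonic: receive from the left, transmit to the right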

architecture Behavioral of uart is
  -- Bit timing. The dividers below are expressed in clk cycles, so clk is
  -- presumed to run at SYSFREQ Hz (TODO confirm against the board clock).
  constant BAUD: positive := baudrate;
  constant SYSFREQ: natural := 100_000_000;
  -- The receiver takes three samples per bit, hence the extra division by 3.
  constant RXCLKCNT: natural := SYSFREQ / BAUD / 3;
  -- The transmitter advances once per bit.
  constant TXCLKCNT: natural := SYSFREQ / BAUD;

  type Fifo is array(integer range <>) of std_logic_vector(7 downto 0);

  ---- all dffs below

  -- Both FIFOs are 4 bytes deep. The Rx FIFO was previously declared with
  -- 8 entries, but the fill counter saturates at 4 and only indices 0..3 are
  -- ever read, so the upper half was dead storage.
  signal rxfifo : Fifo(0 to 3);       -- newest byte enters at index 0
  signal txfifo : Fifo(3 downto 0);   -- next byte to send sits at index 0

  signal rxcnt  : unsigned(3 downto 0);  -- number of valid bytes in rxfifo
  signal txcnt  : unsigned(3 downto 0);  -- number of valid bytes in txfifo

  -- 16-bit dividers: with SYSFREQ = 100 MHz the Tx divider fits for baud
  -- rates of roughly 1526 and above.
  signal rxclk : unsigned(15 downto 0);
  signal txclk : unsigned(15 downto 0);
  signal rxen : std_logic;  -- 1 when rx state machine can do stuff
  signal txen : std_logic;  -- 1 when tx state machine can do stuff

  type RxState_t is (IDLE, SHIFT_IN);
  type TxState_t is (IDLE, SHIFT_OUT);

  signal rxstate: RxState_t;
  signal txstate: TxState_t;

  -- Handshake flags from the rx/tx state machines to the FIFO bookkeeping:
  -- txpopped = a byte was taken from txfifo(0),
  -- rxpushed = a byte was inserted into rxfifo(0).
  signal txpopped : std_logic;
  signal rxpushed : std_logic;

  -- Serial shift registers and their bit counters.
  signal txshift : std_logic_vector(7 downto 0);
  signal rxshift : std_logic_vector(7 downto 0);

  signal txshiftcnt : unsigned(3 downto 0);
  signal rxshiftcnt : unsigned(3 downto 0);

  -- Receiver sampling state.
  signal clockoffset : unsigned(15 downto 0);           -- phase correction applied to rxclk
  signal rxpinprev: std_logic_vector(1 downto 0);       -- first two of the three samples of a bit
  signal rxsamplecount: unsigned(2 downto 0);           -- index of the next sample within the bit
begin

  -- Receiver state machine.
  -- Registers owned here: rxstate, rxpushed, rxshift, rxshiftcnt, rxfifo,
  -- clockoffset, rxpinprev, rxsamplecount.
  --
  -- Every rxen tick takes one sample of rx_pin. The first two samples of a
  -- group are stored in rxpinprev; on the third tick the group is evaluated,
  -- with rxpinprev(1) (the middle sample) used as the bit value.
  rx_fsm : process(clk, rst)
    -- Size of one phase correction step, in clk cycles.
    constant PHASE_STEP : natural := RXCLKCNT / 2;
  begin
    if rst = '1' then
      rxfifo        <= (others => x"00");
      rxshift       <= (others => '0');
      rxshiftcnt    <= (others => '0');
      rxsamplecount <= (others => '0');
      rxpinprev     <= (others => '0');
      clockoffset   <= (others => '0');
      rxpushed      <= '0';
      rxstate       <= IDLE;
    elsif rising_edge(clk) then
      if rxen = '1' then
        if rxsamplecount < 2 then
          -- first or second sample of the group: just remember it
          rxpinprev(to_integer(rxsamplecount)) <= rx_pin;
          rxsamplecount <= rxsamplecount + 1;
        elsif rxsamplecount = 2 then
          -- third sample: evaluate the group and start a new one
          rxsamplecount <= (others => '0');
          rxpushed <= '0';

          -- Nudge the sampling phase when the three samples disagree.
          if rxpinprev(0) /= rxpinprev(1) then
            -- edge between samples 0 and 1: we are too fast, slow down
            clockoffset <= clockoffset - PHASE_STEP;
          elsif rx_pin /= rxpinprev(1) then
            -- edge between samples 1 and 2: we are too slow, speed up
            clockoffset <= clockoffset + PHASE_STEP;
          end if;

          case rxstate is
            when SHIFT_IN =>
              if rxshiftcnt /= 8 then
                -- data bit: shift it in from the left
                rxshiftcnt <= rxshiftcnt + 1;
                rxshift <= rxpinprev(1) & rxshift(7 downto 1);
              else
                -- ninth bit time: this should be the stop bit
                if rxpinprev(1) = '1' then
                  -- valid frame: move every entry one slot up and insert
                  -- the new byte at index 0
                  rxfifo(rxfifo'low + 1 to rxfifo'high) <= rxfifo(rxfifo'low to rxfifo'high - 1);
                  rxfifo(0) <= rxshift;
                  rxpushed <= '1';
                end if;
                -- back to idle whether or not the frame was valid
                rxstate <= IDLE;
              end if;
            when IDLE =>
              -- a low middle sample is taken as the start bit
              if rxpinprev(1) = '0' then
                rxstate <= SHIFT_IN;
                rxshiftcnt <= "0000";
                rxshift <= x"00";
              end if;
          end case;
        end if;
      end if;
    end if;
  end process rx_fsm;

  -- Transmitter state machine.
  -- Registers owned here: txstate, txpopped, txshift, txshiftcnt, tx_pin.
  -- Advances by one step per txen tick, i.e. one step per bit time.
  tx_fsm : process(clk, rst)
  begin
    if rst = '1' then
      tx_pin     <= '1';
      txshift    <= (others => '0');
      txshiftcnt <= (others => '0');
      txpopped   <= '0';
      txstate    <= IDLE;
    elsif rising_edge(clk) then
      if txen = '1' then
        -- txpopped is only cleared here, so once set it stays high until
        -- the next txen tick
        txpopped <= '0';

        case txstate is
          when SHIFT_OUT =>
            if txshiftcnt /= 8 then
              -- send the next data bit, least significant first
              tx_pin <= txshift(0);
              txshift(6 downto 0) <= txshift(7 downto 1);
              txshiftcnt <= txshiftcnt + 1;
            else
              -- all eight bits sent: drive the line high (stop bit)
              tx_pin <= '1';
              txstate <= IDLE;
            end if;
          when IDLE =>
            if txcnt > 0 then
              -- a byte is waiting: grab it and emit the start bit
              txshift <= txfifo(0);
              txpopped <= '1';
              txshiftcnt <= "0000";
              tx_pin <= '0';
              txstate <= SHIFT_OUT;
            else
              -- nothing to send, keep the line idle
              tx_pin <= '1';
            end if;
        end case;
      end if;
    end if;
  end process tx_fsm;

  -- Receive sample-tick generator; drives rxclk and rxen.
  -- rxen is high for one clk cycle each time rxclk + clockoffset reaches
  -- RXCLKCNT - 1; the counter then restarts at -clockoffset so that the
  -- sum starts again from zero.
  rx_tick_gen : process(clk, rst)
  begin
    if rst = '1' then
      rxen  <= '0';
      rxclk <= (others => '0');
    elsif rising_edge(clk) then
      -- default: keep counting, no tick
      rxen  <= '0';
      rxclk <= rxclk + 1;

      if rxclk + clockoffset = RXCLKCNT - 1 then
        -- terminal count: emit a tick and reload
        rxen  <= '1';
        rxclk <= 0 - clockoffset;
      end if;
    end if;
  end process rx_tick_gen;

  -- Transmit bit-tick generator; drives txclk and txen.
  -- txen is high for one clk cycle every TXCLKCNT clk cycles.
  tx_tick_gen : process(clk, rst)
  begin
    if rst = '1' then
      txen  <= '0';
      txclk <= (others => '0');
    elsif rising_edge(clk) then
      -- default: keep counting, no tick
      txen  <= '0';
      txclk <= txclk + 1;

      if txclk = TXCLKCNT - 1 then
        -- terminal count: emit a tick and restart from zero
        txen  <= '1';
        txclk <= (others => '0');
      end if;
    end if;
  end process tx_tick_gen;

  -- Bus interface and FIFO bookkeeping; drives dout, rxcnt, txcnt, txfifo.
  --
  -- Each clk cycle this process
  --   1. accounts for a byte popped by the transmitter (txpopped) or pushed
  --      by the receiver (rxpushed), and
  --   2. serves the bus access selected by addr / we / re.
  --
  -- txpopped and rxpushed stay high for many clk cycles, so the variables
  -- txpopdone / rxpushdone make sure each event is counted exactly once.
  bus_if : process(clk, rst)
    -- fill levels after this cycle's pop/push has been accounted for
    variable txn: unsigned(3 downto 0);
    variable rxn: unsigned(3 downto 0);

    variable txpopdone  : std_logic := '0'; -- latch
    variable rxpushdone : std_logic := '0'; -- latch
  begin

    if rst = '1' then
      for i in txfifo'low to txfifo'high loop
        txfifo(i) <= x"00";
      end loop;

      rxcnt <= (others => '0');
      txcnt <= (others => '0');

      txpopdone := '0';
      rxpushdone := '0';
    elsif rising_edge(clk) then
      rxn := rxcnt;
      txn := txcnt;

      -- dout reads as zero unless a branch below overrides it
      dout <= x"0000";

      -- Fifo grooming
      if txpopped = '1' then
        if txpopdone = '0' then
          -- The transmitter took txfifo(0): move the remaining entries one
          -- slot towards index 0. txfifo is declared "3 downto 0", so the
          -- ascending range has to run from 'low to 'high - 1; the previous
          -- "'high to 'low - 1" was a null range and never shifted anything.
          for i in txfifo'low to txfifo'high - 1 loop
            txfifo(i) <= txfifo(i + 1);
          end loop;
          txpopdone := '1';
          txn := txcnt - 1;
        end if;
      else
        txpopdone := '0';
      end if;

      if rxpushed = '1' then
        if rxpushdone = '0' then
          -- our fifo was already moved, just update rxn
          -- (the count saturates at the 4-byte Rx FIFO depth)
          if rxcnt < 4 then
            rxn := rxcnt + 1;
          end if;
          rxpushdone := '1';
        end if;
      else
        rxpushdone := '0';
      end if;

      txcnt <= txn;
      rxcnt <= rxn;

      -- Bus access. Assignments made here come after the grooming above and
      -- therefore take precedence for the same signal or FIFO slot.
      case addr is
        when x"0000" =>
          if we = '1' then
            -- Append at the first free slot while the FIFO is not full.
            -- The bound must be 'high (3): comparing against 'low (0) only
            -- accepted a byte when the FIFO was completely empty. A write to
            -- a full FIFO is dropped.
            if to_integer(txn) <= txfifo'high then
              txfifo(to_integer(txn)) <= din(7 downto 0);
              txcnt <= txn + 1;
            end if;
          elsif re = '1' then
            -- The oldest received byte sits at index rxn - 1; an empty FIFO
            -- reads as zero.
            if to_integer(rxn) > 0 then
              dout(7 downto 0) <= rxfifo(to_integer(rxn) - 1);
              rxcnt <= rxn - 1;
            end if;
          end if;
        when x"0002" =>
          -- status flags: bit 1 = rxne, bit 0 = txe
          if to_integer(rxn) > 0 then
            dout(1) <= '1';
          else
            dout(1) <= '0';
          end if;
          if to_integer(txn) = 0 then
            dout(0) <= '1';
          else
            dout(0) <= '0';
          end if;
        when others =>
          null;  -- unmapped address: dout keeps its default of zero
      end case;
    end if;
  end process bus_if;

end Behavioral;