/*----------------------------------------------------------------------------

CPUTestBypass

This is a testbench for FiveStageCPUBypass.

The testbench executes a simple program which should have the following
trace through the pipeline:

(In this picture, the items under Fetch are the items which will
be next loaded from memory.  The items under decode indicate
the item waiting in the BF buffer.  The items under the other
stages indicate the items waiting in the buffers before those
stages.  Thus, for any item, there needs to be one cycle of no
stall before the item moves to the next column.  For example, the
second Add instruction (Add R4) stalls once, thus it is at the head
of the BF buffer for two cycles (times 5 and 6).)

Time   Fetch      Decode    Execute     Memory   Writeback
----+----------+----------+----------+----------+----------+
  0 | LoadC R0 |          |          |          |          |
    |          |          |          |          |          |
----+----------+----------+----------+----------+----------+
  1 | LoadC R1 | LoadC R0 |          |          |          |
    |          |          |          |          |          |
----+----------+----------+----------+----------+----------+
  2 | LoadC R2 | LoadC R1 | LoadC R0 |          |          |
    |          |          |          |          |          |
----+----------+----------+----------+----------+----------+
  3 | Add R3   | LoadC R2 | LoadC R1 | LoadC R0 |          |
    |   R0 R1  |          |          |          |          |
----+----------+----------+----------+----------+----------+
  4 | Add R4   | Add R3   | LoadC R2 | LoadC R1 | LoadC R0 | Bypass: R0 and R1
    |   R3 R2  |   R0 R1  |          |          |          |  available **
----+----------+----------+----------+----------+----------+
  5 | Store    | Add R4   | Add R3   | LoadC R2 | LoadC R1 | Stall: R3 not
    |   R4 R1  |   R3 R2  |   R0 R1  |          |          |  loaded yet
----+----------+----------+----------+----------+----------+
  6 | Store    | Add R4   |          | Add R3   | LoadC R2 | Bypass: R3 and R2
    |   R4 R1  |   R3 R2  |          |   R0 R1  |          |  available
----+----------+----------+----------+----------+----------+
  7 | LoadC R5 | Store    | Add R4   |          | Add R3   | Stall: R4 not
    |          |   R4 R1  |   R3 R2  |          |   R0 R1  |  loaded yet
----+----------+----------+----------+----------+----------+
  8 | LoadC R5 | Store    |          | Add R4   |          | Bypass: R4
    |          |   R4 R1  |          |   R3 R2  |          |  available
----+----------+----------+----------+----------+----------+
  9 | Jz R0 R5 | LoadC R5 | Store    |          | Add R4   |
    |          |          |   R4 R1  |          |   R3 R2  |
----+----------+----------+----------+----------+----------+
 10 | Add R4   | Jz R0 R5 | LoadC R5 | Store    |          | Stall: R5 not
    |   R4 R4  |          |          |   R4 R1  |          |  available ***
----+----------+----------+----------+----------+----------+
 11 | Add R4   | Jz R0 R5 |          | LoadC R5 | Store    | Bypass: R5
    |   R4 R4  |          |          |          |   R4 R1  |  available
----+----------+----------+----------+----------+----------+
 12 | Store    | Add R4   | Jz R0 R5 |          | LoadC R5 | Jump clears PC
    |   R4 R0  |   R4 R4  |          |          |          | (decode stalls,
----+----------+----------+----------+----------+----------+  and no fetch)
 13 | Store    |          |          |          |          |
    |   R4 R0  |          |          |          |          |
----+----------+----------+----------+----------+----------+
 14 | Halt     | Store    |          |          |          |
    |          |   R4 R0  |          |          |          |
----+----------+----------+----------+----------+----------+
 15 |          | Halt     | Store    |          |          |
    |          |          |   R4 R0  |          |          |
----+----------+----------+----------+----------+----------+
 16 |          | Halt     |          | Store    |          |
    |          |          |          |   R4 R0  |          |
----+----------+----------+----------+----------+----------+

** If bypassing is improperly implemented, this instruction could get
the wrong value for R0.  Remember that writes to R0 are ignored, so
bypassing should ignore R0.

*** In this CPU, we have chosen not to bypass from the buffer after
the decode stage.  However, the LoadC instruction could be bypassed
from that point (no computation is needed to determine the value to
be loaded).  We have chosen not to implement this, so a stall occurs
at the indicated point.

And as observed, there are 17 cycles in the operation of the CPU.
The total simulation time is 11 cycles of writing to the instruction
memory, 1 cycle to start the CPU, and 17 cycles of operation = 29.
As expected, the simulation runs from 0 to 290 (cycle time is 10).

-----------------------------------------------------------------------------*/

import FiveStageCPUBypass::*;

// Address type used by the testbench when indexing the CPU's memories.
typedef Bit#(32) MemAddr;

// State of the testbench's control sequence.  The register holding this
// state starts at WriteIMem(0) and advances WriteIMem(n) -> Start -> Running.
typedef union tagged {
    // Instruction memory is being loaded; the payload is the next address
    // to write.
    MemAddr WriteIMem;
    // NOTE(review): not referenced by any rule in this testbench;
    // presumably reserved for preloading data memory -- confirm before
    // removing, since the tag order affects the derived Bits encoding.
    MemAddr WriteDMem;
    // The program is loaded; the CPU is told to start on this cycle.
    void Start;
    // The CPU has been started; the testbench waits for it to report done.
    void Running;
} TestStage deriving (Eq, Bits);

// Testbench for mkFiveStageCPUBypass.
//
// Sequence of operation (driven by the `state` register):
//   1. WriteIMem(n): write instruction n of the test program into the CPU's
//      instruction memory, one instruction per rule firing, for
//      n = 0 .. maxInstr.
//   2. Start:        call cpu.start once.
//   3. Running:      wait until cpu.done, print data-memory location 0 and
//      finish the simulation.
module mkCPUTestBypass(Empty);
  // Device under test.
  CPU cpu();
  mkFiveStageCPUBypass the_cpu(cpu);

  // Testbench control state; begins by loading instruction address 0.
  Reg#(TestStage) state();
  mkReg#(WriteIMem(0)) the_state(state);

  // Address of the last instruction in the test program.  Must be kept in
  // sync with the highest explicit arm of the case in nextInstr.
  MemAddr maxInstr = 10;

  // Returns the instruction of the test program stored at address n.
  //
  // Addresses 0 .. maxInstr hold the program traced in the header comment of
  // this file.  Any other address yields Halt, so the function is defined for
  // every input rather than leaving the result unspecified; the testbench
  // itself never asks for an address above maxInstr.
  function Instr nextInstr(MemAddr n);
   case (n)
      0 :  return (tagged LoadC {rd:R0, v:10});
      1 :  return (tagged LoadC {rd:R1, v:15});
      2 :  return (tagged LoadC {rd:R2, v:20});
      3 :  return (tagged Add {rd:R3, ra:R0, rb:R1});
      4 :  return (tagged Add {rd:R4, ra:R3, rb:R2});
      5 :  return (tagged Store {v:R4, addr:R1});
      6 :  return (tagged LoadC {rd:R5, v: 9});
      // Per the header trace this jump is taken (target address is in R5,
      // loaded with 9 above), so instruction 8 is fetched but discarded.
      7 :  return (tagged Jz {cd:R0, addr:R5});
      8 :  return (tagged Add {rd:R4, ra:R4, rb:R4});
      9 :  return (tagged Store {v:R4, addr:R0});
     10 :  return (tagged Halt);
      default :  return (tagged Halt);
   endcase

// Program involving potential new instruction:
//   case (n)
//     0 :  return (tagged LoadC {rd:R0, v:10});
//     1 :  return (tagged LoadC {rd:R1, v:15});
//     2 :  return (tagged LoadPC {rd:R3});
//     3 :  return (tagged LShift {rd:R5, ra:R3, rb:R3});
//     4 :  return (tagged Store {v:R5, addr:R6});
//     5 :  return (tagged Add {rd:R2, ra:R0, rb:R1});
//     6 :  return (tagged Store {v:R2, addr:R3});
//   endcase
  endfunction: nextInstr

  // Load phase: store the packed, zero-extended encoding of instruction n at
  // instruction-memory address n, then either advance to the next address or,
  // after the last instruction, move on to the Start state.
  rule writing_InstrMem (state matches (tagged WriteIMem .n));
     cpu.imem.put(n, zeroExtend(pack(nextInstr(n))));
     state <= (n == maxInstr ? Start : WriteIMem (n + 1));
  endrule: writing_InstrMem

  // Start phase: kick off the CPU exactly once, then wait in Running.
  rule starting_CPU (state matches Start);
     cpu.start;
     state <= Running;
  endrule: starting_CPU

  // Finish phase: once the CPU reports done, print the contents of data
  // memory location 0 together with the simulation time, and end the
  // simulation.
  // NOTE(review): with R0 reading as zero (the header says writes to R0 are
  // ignored) the final Store should leave 15 + 20 = 35 at location 0 --
  // confirm against the expected-output file.
  rule done (state matches Running &&& cpu.done);
     $display("DMem location %d has value %d at time %d",
              0, cpu.dmem.get(0), $stime);
     $finish(0);
  endrule: done

endmodule: mkCPUTestBypass