package MesaCircLpm;

// ----------------------------------------------------------------
//
// The LPM module is responsible for taking 32-bit IP addresses, looking up
// the destination (32-bit data) for each IP address in a table in an SRAM,
// and returning the destinations.
//
// ----------------------------------------------------------------
//
// ILpm is the interface to the LPM module, containing two
// sub-interfaces: mif and ram.  It is defined in the interface definition
// package, MesaIDefs, imported above.
//
// ----------------------------------------------------------------
//
// The LPM module sits between two other blocks:
//   * the MIF (Media Interface)
//   * the SRAM
//
// The LPM receives requests from the MIF module by the method
//     mif.put (LuRequest, LuTag)
// and returns results (some cycles later) of the form (luResponse, luTag) by
// the method
//     mif.get.
//
// The LPM sends addresses to the RAM by calling
//     ram.put(sramAddr)
// and receives result data (some cycles later) from the RAM by the method
//     ram.get
//
// ----------------------------------------------------------------
//
// The longest prefix match traverses a tree representation in  memory. The
// first step is a table lookup, and after that it iterates if necessary until
// a leaf is reached.
//
// ----------------------------------------------------------------
//
// The module is pipelined, i.e., IP addresses stream in, and results stream
// out (after some latency).  Results are returned in the same order as
// requests.
//
// The SRAM is also pipelined: addresses stream in, and data stream out
// (after some latency).  It can accept addresses and deliver data on every
// cycle.
//
// Performance metric: as long as IP lookup requests are available, the LPM
// module must keep the SRAM 100% utilized, i.e., it should issue a request
// into the SRAM on every cycle.
//
// ----------------------------------------------------------------
//
// This version uses a circular pipeline, with one memory access each cycle
// for each circulating value.
//
// ----------------------------------------------------------------

// Required library packages:
import FIFO::*;
import RAM::*;
import ClientServer::*;
import CompletionBuffer::*;
import GetPut::*;
import CGetPut::*;
import List::*;
import Connectable::*;

// Other Mesa packages:
import MesaDefs::*;
import MesaIDefs::*;

// The type of results from the table lookup:

typedef union tagged {
    Bit#(21) Pointer;
    Bit#(31) Leaf;
} TableType deriving (Eq, Bits);

// The first lookup consumes the first 16 bits of the IP address.  The
// following type is for the remainder -- either two eight-bit chunks (after
// the first lookup) or just one (after the second):

typedef union tagged {
    Tuple2#(Bit#(8),Bit#(8)) L1;
    Bit#(8) L2;
    void L3;
} IP deriving (Eq, Bits);

// Latency is the total latency for the Lpm; this includes all steps in
// the process and the memory latency.  If latency is too low then the
// external ram will not be fully utilized. If the latency is too high
// then unnecessarily large FIFOs are created.

typedef 15 Latency;

// Longest prefix matching is done on several requests at the same time. A
// completion buffer ensures that the output is in the same order as the
// requests. The completion buffer also limits the number of requests worked
// on at the same time.

typedef Latency CompletionBufferSize;
typedef CBToken#(CompletionBufferSize) CompletionToken;

(* synthesize *)
module mkMesaLpm(ILpm);
   // registers for debugging purposes:
   Reg#(LuRequest) requestB32();
   mkRegU the_requestB32(requestB32);

   Reg#(LuTag) requestTag();
   mkRegU the_requestTag(requestTag);

   Reg#(LuResponse) responseB32();
   mkRegU the_responseB32(responseB32);

   Reg#(LuTag) responseTag();
   mkRegU the_responseTag(responseTag);

   // Technical detail: Latency is actually a numeric type.  We now define
   // an integer with corresponding value:
   Integer sz = valueOf(Latency);

   // the FIFOs for requests to and responses from the RAM:
   FIFO#(SramAddr) sramReq();
   mkSizedFIFO#(2) the_sramReq(sramReq);

   FIFO#(SramData) sramResp();
   mkSizedFIFO#(sz + 2) the_sramResp(sramResp);

   // The FIFO for input requests:
   FIFO#(Tuple2#(LuRequest, LuTag)) ififo();
   mkSizedFIFO#(sz) the_ififo(ififo);

   // The FIFO for work in progress:
   FIFO#(Tuple3#(IP, LuTag, CompletionToken)) fifo();
   mkSizedFIFO#(sz + 2) the_fifo(fifo);

   // The Completion Buffer, which holds completed results
   CompletionBuffer#(CompletionBufferSize, Tuple2#(LuResponse, LuTag)) completionBuffer();
   mkCompletionBuffer the_completionBuffer(completionBuffer);

   // A FIFO for completion tokens from the Completion Buffer:
   FIFO#(CompletionToken) tagfifo();
   mkSizedFIFO#(sz+2) the_tagfifo(tagfifo);

   // populate the tagfifo:
   rule get_tags;
      let tag <- completionBuffer.reserve.get;
      tagfifo.enq(tag);
   endrule

   // All processing starts with a table lookup. This rule also claims a
   // completion token from the completion buffer:
   rule stage0;
      match {.ireq,.ilutag} = ififo.first;
      ififo.deq;
      let tag = tagfifo.first;
      tagfifo.deq;

      // We send a request to the RAM, and also enqueue (for the next
      // stage) the remainder of the address, the lookup-tag and the
      // completion-buffer tag.  (Ln is the remainder after the nth
      // lookup).
      fifo.enq(tuple3(L1 (tuple2(ireq[15:8], ireq[7:0])), ilutag, tag));
      sramReq.enq(zeroExtend(ireq[31:16]));
   endrule: stage0

   // Processing stops if we found a Leaf.
   rule stage1_Leaf (unpack(sramResp.first) matches tagged Leaf (.v));
      let {ip,lutag,itag} = fifo.first;
      completionBuffer.complete.put(tuple2(itag,tuple2(unpack({0,v}),lutag)));
      // TASK: complete the definition of this rule.
   endrule: stage1_Leaf

   // If we found a pointer, a further lookup is necessary.  We also
   // enqueue the remainder of the address (Ln is the remainder after the
   // nth lookup).
   rule stage1_Pointer (unpack(sramResp.first) matches tagged Pointer (.p));
      sramResp.deq;
      let {ip,lutag,itag} = fifo.first;
      fifo.deq;
      // nr: next remainder
      let nr = begin case (ip) matches
			tagged L1 {.b,.c} :  (L2 (c));
			tagged L2 {.c}    :  (L3);
			tagged L3         :  (?);  // never happens
		     endcase end;
      let addr = ?; // TASK: complete this definition.

      fifo.enq(tuple3(nr,lutag,itag));
      sramReq.enq(addr);
   endrule: stage1_Pointer

   // Finally we define the two interfaces:
   interface Server mif;
      interface Put request;
	 method put(x);
	    action
	       ififo.enq(x);
	       // record request for debugging purposes:
	       let {ipa,lutag} = x;
	       requestB32 <= ipa;
	       requestTag <= lutag;
	    endaction
	 endmethod: put
	 // or, apart from the debugging info:
	 // interface request = fifoToPut(ififo);
      endinterface: request
      interface Get  response;
	 method get() ;
	    actionvalue
	       let resp <- completionBuffer.drain.get();
	       // record response for debugging purposes:
	       let {r,t} = resp;
	       responseB32 <= r;
	       responseTag <= t;
	       return(resp);
	    endactionvalue
	 endmethod: get
	 // or, apart from the debugging info:
	 // interface response = fifoToGet(completionBuffer.drain);
      endinterface: response
   endinterface: mif

   interface Client ram;
      interface Get request;
	 method get();
	    actionvalue
	       let req = sramReq.first;
	       sramReq.deq;
	       return (Read (req));
	    endactionvalue
	 endmethod: get
      endinterface: request
      interface Put response = fifoToPut(sramResp);
   endinterface: ram
endmodule

endpackage

/*
 TASK:
 When you have got this version to work, investigate how its performance is
 affected by the value of latency above.

 Consider, too, how the quality of the design might be affected by the precise
 details of the definition of the type IP.
*/