Added L1DCache, L1DCacheTest and some tilelink stuff.

Just compiles.

Added L1DCache, L1DCacheTest and some tilelink stuff.
Just compiles.
638167ab · Allen · 61f69ef5 · 638167ab · 638167ab · 638167ab
21 changed files
--- a/src/main/scala/bus/tilelink/Arbiter.scala
+++ b/src/main/scala/bus/tilelink/Arbiter.scala
+// See LICENSE.SiFive for license details.
+
+package bus.tilelink
+
+import chisel3._
+import chisel3.util._
+import utils.Or
+
+/** Helpers for arbitrating several Decoupled sources onto a single sink.
+  *
+  * A `Policy` maps the current request vector to a grant (ready) vector.
+  * `apply` wraps a policy with beat counting so that, once a source has been
+  * granted, it keeps the sink until `beatsLeft` has counted down to zero.
+  */
+object TLArbiter
+{
+  // (width, valids, select) => readys
+  type Policy = (Int, UInt, Bool) => UInt
+
+  // Fixed priority: bit i of the result is set only when no lower-indexed valid is set.
+  val lowestIndexFirst: Policy = (width, valids, select) => ~(Or.leftOR(valids) << 1)(width-1, 0)
+
+  // Rotating priority: `mask` is rewritten from the granted request every time
+  // `select` is asserted while at least one request is valid.
+  val roundRobin: Policy = (width, valids, select) => if (width == 1) 1.U(1.W) else {
+    val valid = valids(width-1, 0)
+    assert (valid === valids)
+    val mask = RegInit(((BigInt(1) << width)-1).U(width.W))
+    val filter = Cat(valid & ~mask, valid)
+    val unready = (Or.rightOR(filter, width*2, width) >> 1) | (mask << width)
+    val readys = ~((unready >> width) & unready(width-1, 0))
+    when (select && valid.orR) {
+      mask := Or.leftOR(readys & valid, width)
+    }
+    readys(width-1, 0)
+  }
+
+  /** Lowest-index-first arbitration over a `Seq` of TileLink channel sources. */
+  def lowestFromSeq[T <: TLChannel](sink: DecoupledIO[T], sources: Seq[DecoupledIO[T]]): Unit = {
+    apply(lowestIndexFirst)(sink, sources.map(s => (TLUtilities.numBeats1(s.bits), s)):_*)
+  }
+
+  /** Lowest-index-first arbitration over a varargs list of TileLink channel sources. */
+  def lowest[T <: TLChannel](sink: DecoupledIO[T], sources: DecoupledIO[T]*): Unit = {
+    apply(lowestIndexFirst)(sink, sources.toList.map(s => (TLUtilities.numBeats1(s.bits), s)):_*)
+  }
+
+  /** Round-robin arbitration over a varargs list of TileLink channel sources. */
+  def robin[T <: TLChannel](sink: DecoupledIO[T], sources: DecoupledIO[T]*): Unit = {
+    apply(roundRobin)(sink, sources.toList.map(s => (TLUtilities.numBeats1(s.bits), s)):_*)
+  }
+
+  /** Connects `sources` to `sink` under the given arbitration `policy`.
+    *
+    * @param policy  grant function applied whenever no burst is in flight
+    * @param sink    the single consumer
+    * @param sources pairs of (value loaded into the beat counter when the source wins, source)
+    *
+    * With no sources only `sink.valid` is driven (to false); with exactly one
+    * source the two interfaces are bulk-connected and no arbitration logic is built.
+    */
+  def apply[T <: Data](policy: Policy)(sink: DecoupledIO[T], sources: (UInt, DecoupledIO[T])*): Unit = {
+    if (sources.isEmpty) {
+      sink.valid := false.B
+    } else if (sources.size == 1) {
+      sink <> sources.head._2
+    } else {
+      val pairs = sources.toList
+      val beatsIn = pairs.map(_._1)
+      val sourcesIn = pairs.map(_._2)
+
+      // The number of beats which remain to be sent
+      val beatsLeft = RegInit(0.U)
+      val idle = beatsLeft === 0.U
+      val latch = idle && sink.ready // winner (if any) claims sink
+
+      // Who wants access to the sink?
+      val valids = sourcesIn.map(_.valid)
+      // Arbitrate amongst the requests
+      val readys = VecInit(policy(valids.size, Cat(valids.reverse), latch).asBools)
+      // Which request wins arbitration?
+      val winner = VecInit((readys zip valids) map { case (r,v) => r&&v })
+
+      // Confirm the policy works properly
+      require (readys.size == valids.size)
+      // Never two winners
+      val prefixOR = winner.scanLeft(false.B)(_||_).init
+      assert((prefixOR zip winner) map { case (p,w) => !p || !w } reduce {_ && _})
+      // If there was any request, there is a winner
+      assert (!valids.reduce(_||_) || winner.reduce(_||_))
+
+      // Track remaining beats
+      val maskedBeats = (winner zip beatsIn) map { case (w,b) => Mux(w, b, 0.U) }
+      val initBeats = maskedBeats.reduce(_ | _) // no winner => 0 beats
+      beatsLeft := Mux(latch, initBeats, beatsLeft - sink.fire())
+
+      // The one-hot source granted access in the previous cycle
+      val state = RegInit(VecInit(Seq.fill(sources.size)(false.B)))
+      val muxState = Mux(idle, winner, state)
+      state := muxState
+
+      // While a burst is in flight only the latched source may be ready
+      val allowed = Mux(idle, readys, state)
+      (sourcesIn zip allowed) foreach { case (s, r) =>
+        s.ready := sink.ready && r
+      }
+      sink.valid := Mux(idle, valids.reduce(_||_), Mux1H(state, valids))
+      sink.bits := Mux1H(muxState, sourcesIn.map(_.bits))
+    }
+  }
+}
+
+/** Synthesizable unit tests (currently disabled: the block below is commented out) */
+/*
+import freechips.rocketchip.unittest._
+
+class TestRobin(txns: Int = 128, timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) {
+  val sources = Wire(Vec(6, DecoupledIO(UInt(width=3))))
+  val sink = Wire(DecoupledIO(UInt(width=3)))
+  val count = RegInit(0.U(8.W))
+
+  val lfsr = LFSR(16, Bool(true))
+  val valid = lfsr(0)
+  val ready = lfsr(15)
+
+  sources.zipWithIndex.map { case (z, i) => z.bits := i.U }
+  sources(0).valid := valid
+  sources(1).valid := false.B
+  sources(2).valid := valid
+  sources(3).valid := valid
+  sources(4).valid := false.B
+  sources(5).valid := valid
+  sink.ready := ready
+
+  TLArbiter(TLArbiter.roundRobin)(sink, sources.zipWithIndex.map { case (z, i) => (i.U, z) }:_*)
+  when (sink.fire()) { printf("TestRobin: %d\n", sink.bits) }
+  when (!sink.fire()) { printf("TestRobin: idle (%d %d)\n", valid, ready) }
+
+  count := count + 1.U
+  io.finished := count >= txns.U
+}
+*/
--- a/src/main/scala/bus/tilelink/Metadata.scala
+++ b/src/main/scala/bus/tilelink/Metadata.scala
+// See LICENSE.SiFive for license details.
+// See LICENSE.Berkeley for license details.
+
+package bus.tilelink
+
+import chisel3._
+import chisel3.util._
+import xiangshan.mem.MemoryOpConstants
+import utils.MuxTLookup
+
+object ClientStates {
+  val width = 2
+
+  // This is presumably a protocol similar to MESI?
+  // It is unclear whether these states are mandated by TileLink or defined locally.
+  // What is certain:
+  // Nothing means invalid
+  // Dirty means dirty
+  // Branch and Trunk: one is shared and the other exclusive?
+  // Judging by hasWritePermission below, Branch is presumably shared
+  // and Trunk is exclusive, which makes this easy to understand.
+  def Nothing = 0.U(width.W)
+  def Branch  = 1.U(width.W)
+  def Trunk   = 2.U(width.W)
+  def Dirty   = 3.U(width.W)
+
+  def hasReadPermission(state: UInt): Bool = state > Nothing
+  def hasWritePermission(state: UInt): Bool = state > Branch
+}
+
+// Two-bit classification of a memory command: Cat(isWrite, isWriteIntent).
+object MemoryOpCategories extends MemoryOpConstants {
+  def wr = Cat(true.B, true.B)   // Op actually writes
+  def wi = Cat(false.B, true.B)  // Future op will write
+  def rd = Cat(false.B, false.B) // Op only reads
+
+  // Returns the category bits for cmd; the result is compared against wr / wi / rd.
+  def categorize(cmd: UInt): UInt = {
+    val cat = Cat(isWrite(cmd), isWriteIntent(cmd))
+    //assert(cat.isOneOf(wr,wi,rd), "Could not categorize command.")
+    cat
+  }
+}
+
+/** Stores the client-side coherence information,
+  * such as permissions on the data and whether the data is dirty.
+  * Its API can be used to make TileLink messages in response to
+  * memory operations, cache control operations, or Probe messages.
+  */
+class ClientMetadata extends Bundle {
+  /** Actual state information stored in this bundle */
+  val state = UInt(width = ClientStates.width.W)
+
+  /** Metadata equality */
+  def ===(rhs: UInt): Bool = state === rhs
+  def ===(rhs: ClientMetadata): Bool = state === rhs.state
+  def =/=(rhs: ClientMetadata): Bool = !this.===(rhs)
+
+  /** Is the block's data present in this cache */
+  def isValid(dummy: Int = 0): Bool = state > ClientStates.Nothing
+
+  /** Determine whether this cmd misses, and the new state (on hit) or param to be sent (on miss) */
+  private def growStarter(cmd: UInt): (Bool, UInt) = {
+    import MemoryOpCategories._
+    import TLPermissions._
+    import ClientStates._
+    val c = categorize(cmd)
+    MuxTLookup(Cat(c, state), (false.B, 0.U), Seq(
+    //(effect, am now) -> (was a hit,   next)
+      Cat(rd, Dirty)   -> (true.B,  Dirty),
+      Cat(rd, Trunk)   -> (true.B,  Trunk),
+      Cat(rd, Branch)  -> (true.B,  Branch),
+      Cat(wi, Dirty)   -> (true.B,  Dirty),
+      Cat(wi, Trunk)   -> (true.B,  Trunk),
+      Cat(wr, Dirty)   -> (true.B,  Dirty),
+      Cat(wr, Trunk)   -> (true.B,  Dirty),
+    //(effect, am now) -> (was a miss,  param)
+      Cat(rd, Nothing) -> (false.B, NtoB),
+      Cat(wi, Branch)  -> (false.B, BtoT),
+      Cat(wi, Nothing) -> (false.B, NtoT),
+      Cat(wr, Branch)  -> (false.B, BtoT),
+      Cat(wr, Nothing) -> (false.B, NtoT)))
+  }
+
+  /** Determine what state to go to after miss based on Grant param
+    * For now, doesn't depend on state (which may have been Probed).
+    */
+  private def growFinisher(cmd: UInt, param: UInt): UInt = {
+    import MemoryOpCategories._
+    import TLPermissions._
+    import ClientStates._
+    val c = categorize(cmd)
+    //assert(c === rd || param === toT, "Client was expecting trunk permissions.")
+    MuxLookup(Cat(c, param), Nothing, Seq(
+    //(effect param) -> (next)
+      Cat(rd, toB)   -> Branch,
+      Cat(rd, toT)   -> Trunk,
+      Cat(wi, toT)   -> Trunk,
+      Cat(wr, toT)   -> Dirty))
+  }
+
+  /** Does this cache have permissions on this block sufficient to perform op,
+    * and what to do next (Acquire message param or updated metadata). */
+  def onAccess(cmd: UInt): (Bool, UInt, ClientMetadata) = {
+    val r = growStarter(cmd)
+    (r._1, r._2, ClientMetadata(r._2))
+  }
+
+  /** Does a secondary miss on the block require another Acquire message */
+  def onSecondaryAccess(first_cmd: UInt, second_cmd: UInt): (Bool, Bool, UInt, ClientMetadata, UInt) = {
+    import MemoryOpCategories._
+    val r1 = growStarter(first_cmd)
+    val r2 = growStarter(second_cmd)
+    val needs_second_acq = isWriteIntent(second_cmd) && !isWriteIntent(first_cmd)
+    val hit_again = r1._1 && r2._1
+    val dirties = categorize(second_cmd) === wr
+    val biggest_grow_param = Mux(dirties, r2._2, r1._2)
+    val dirtiest_state = ClientMetadata(biggest_grow_param)
+    val dirtiest_cmd = Mux(dirties, second_cmd, first_cmd)
+    (needs_second_acq, hit_again, biggest_grow_param, dirtiest_state, dirtiest_cmd)
+  }
+
+  /** Metadata change on a returned Grant */
+  def onGrant(cmd: UInt, param: UInt): ClientMetadata = ClientMetadata(growFinisher(cmd, param))
+
+  /** Determine what state to go to based on Probe param */
+  // In other words: given the current state and the target state, look up the concrete action?
+  private def shrinkHelper(param: UInt): (Bool, UInt, UInt) = {
+    import ClientStates._
+    import TLPermissions._
+    MuxTLookup(Cat(param, state), (false.B, 0.U, 0.U), Seq(
+    //(wanted, am now)  -> (hasDirtyData resp, next)
+      Cat(toT, Dirty)   -> (true.B,  TtoT, Trunk),
+      Cat(toT, Trunk)   -> (false.B, TtoT, Trunk),
+      Cat(toT, Branch)  -> (false.B, BtoB, Branch),
+      Cat(toT, Nothing) -> (false.B, NtoN, Nothing),
+      Cat(toB, Dirty)   -> (true.B,  TtoB, Branch),
+      Cat(toB, Trunk)   -> (false.B, TtoB, Branch),  // Policy: Don't notify on clean downgrade
+      Cat(toB, Branch)  -> (false.B, BtoB, Branch),
+      Cat(toB, Nothing) -> (false.B, NtoN, Nothing),
+      Cat(toN, Dirty)   -> (true.B,  TtoN, Nothing),
+      Cat(toN, Trunk)   -> (false.B, TtoN, Nothing), // Policy: Don't notify on clean downgrade
+      Cat(toN, Branch)  -> (false.B, BtoN, Nothing), // Policy: Don't notify on clean downgrade
+      Cat(toN, Nothing) -> (false.B, NtoN, Nothing)))
+  }
+
+  /** Translate cache control cmds into Probe param */
+  // Which state transition should be performed for each cache-control command?
+  private def cmdToPermCap(cmd: UInt): UInt = {
+    import MemoryOpCategories._
+    import TLPermissions._
+    MuxLookup(cmd, toN, Seq(
+      M_FLUSH   -> toN,
+      M_PRODUCE -> toB,
+      M_CLEAN   -> toT))
+  }
+
+  def onCacheControl(cmd: UInt): (Bool, UInt, ClientMetadata) = {
+    val r = shrinkHelper(cmdToPermCap(cmd))
+    (r._1, r._2, ClientMetadata(r._3))
+  }
+
+  def onProbe(param: UInt): (Bool, UInt, ClientMetadata) = { 
+    val r = shrinkHelper(param)
+    (r._1, r._2, ClientMetadata(r._3))
+  }
+}
+
+/** Factory helpers that build ClientMetadata wires, including the value used on reset. */
+object ClientMetadata {
+  /** Creates a new ClientMetadata wire whose `state` is driven by `perm`. */
+  def apply(perm: UInt) = {
+    val wrapped = Wire(new ClientMetadata)
+    wrapped.state := perm
+    wrapped
+  }
+  /** Metadata carrying ClientStates.Nothing. */
+  def onReset = apply(ClientStates.Nothing)
+  /** Metadata carrying ClientStates.Dirty. */
+  def maximum = apply(ClientStates.Dirty)
+}
--- a/src/main/scala/bus/tilelink/NaiveTLToAXI4.scala
+++ b/src/main/scala/bus/tilelink/NaiveTLToAXI4.scala
+// See LICENSE.SiFive for license details.
+
+package bus.tilelink
+
+import chisel3._
+import chisel3.util._
+import bus.axi4.AXI4
+import bus.axi4.AXI4Parameters
+import utils.GTimer
+
+// A simple TileLink to AXI4 converter.
+// Only TileLink Get and PutFullData are supported.
+//
+// Write path: s_idle -> s_gather_write_data -> s_write_resp (AccessAck on D)
+//             -> s_wait_awready -> s_mem_write -> s_wait_bresp -> s_idle
+// Read path:  s_idle -> s_wait_arready -> s_mem_read -> s_read_resp (AccessAckData on D) -> s_idle
+// Channels B, C and E are never used: B.valid, C.ready and E.ready are tied low.
+class NaiveTLToAXI4(params: TLParameters) extends Module
+{
+  val io = IO(new Bundle{
+    val in = Flipped(new TLCached(params))
+    val out = new AXI4 
+  })
+
+  // Elaboration-time switch: when true the log helpers below emit printf statements
+  val debug = true
+
+  val in = io.in
+  val out = io.out
+
+  /* parameters */
+  val Y = true.B
+  val N = false.B
+
+  val blockSize = 64 * 8
+  val blockBytes = blockSize / 8
+  val innerBeatSize = in.d.bits.data.getWidth
+  val innerBeatBytes = innerBeatSize / 8
+  val innerDataBeats = blockSize / innerBeatSize
+  val innerBeatBits = log2Ceil(innerBeatBytes)
+  val innerBeatIndexBits = log2Ceil(innerDataBeats)
+  val innerBeatLSB = innerBeatBits
+  val innerBeatMSB = innerBeatLSB + innerBeatIndexBits - 1
+
+  val outerBeatSize = out.w.bits.data.getWidth
+  val outerBeatBytes = outerBeatSize / 8
+  val outerDataBeats = blockSize / outerBeatSize
+  val outerBeatLen = log2Ceil(outerBeatBytes)
+  val outerBurstLen = outerBeatLen + log2Ceil(outerDataBeats)
+  val addrWidth = in.a.bits.address.getWidth
+  val innerIdWidth = in.a.bits.source.getWidth
+  val outerIdWidth = out.aw.bits.id.getWidth
+
+  // Both comparisons are on Scala values, so they are checked during elaboration
+  assert(in.a.bits.address.getWidth == out.aw.bits.addr.getWidth)
+  assert(innerBeatSize == outerBeatSize)
+
+  // Number of outer beats per inner beat (1 while the check above holds)
+  val split = innerBeatSize / outerBeatSize
+  val splitBits = log2Ceil(split)
+  require(isPow2(split))
+
+  val s_idle :: s_gather_write_data :: s_wait_awready :: s_mem_write :: s_wait_bresp :: s_wait_arready :: s_mem_read :: s_read_resp :: s_write_resp :: Nil = Enum(9)
+
+  val state = RegInit(s_idle)
+  val timer = GTimer()
+  val log_prefix = "cycle: %d [L2Cache] state %x "
+  def log_raw(prefix: String, fmt: String, tail: String, args: Bits*) = {
+    if (debug) {
+      printf(prefix + fmt + tail, args:_*)
+    }
+  }
+
+  /** Single log */
+  def log(fmt: String, args: Bits*) = log_raw(log_prefix, fmt, "\n", timer +: state +: args:_*)
+  /** Log with line continued */
+  def log_part(fmt: String, args: Bits*) = log_raw(log_prefix, fmt, "", timer +: state +: args:_*)
+  /** Log with nothing added */
+  def log_plain(fmt: String, args: Bits*) = log_raw("", fmt, "", args:_*)
+
+  // The format string must contain exactly one specifier per argument;
+  // there is no dsid field on this channel, so none is printed.
+  when (in.a.fire()) {
+    log("in.a opcode %x, param %x, size %x, source %x, address %x, mask %x, data %x",
+      in.a.bits.opcode,
+      in.a.bits.param,
+      in.a.bits.size,
+      in.a.bits.source,
+      in.a.bits.address,
+      in.a.bits.mask,
+      in.a.bits.data)
+  }
+
+  /*
+  when (out.a.fire()) {
+    log("out.a.opcode %x, dsid %x, param %x, size %x, source %x, address %x, mask %x, data %x",
+      out.a.bits.opcode,
+      out.a.bits.dsid,
+      out.a.bits.param,
+      out.a.bits.size,
+      out.a.bits.source,
+      out.a.bits.address,
+      out.a.bits.mask,
+      out.a.bits.data)
+  }
+  */
+
+  val in_opcode = in.a.bits.opcode
+  val in_addr = in.a.bits.address
+  val in_id   = in.a.bits.source
+  val in_len_shift = in.a.bits.size >= innerBeatBits.U
+  val in_len  = Mux(in_len_shift, ((1.U << in.a.bits.size) >> innerBeatBits) - 1.U, 0.U)  // #word, i.e., arlen in AXI
+  val in_data = in.a.bits.data
+  val in_data_mask = in.a.bits.mask
+
+  val in_recv_fire = in.a.fire()
+  val in_read_req = in_recv_fire && (in_opcode === TLMessages.Get)
+  val in_write_req = in_recv_fire && (in_opcode === TLMessages.PutFullData)
+
+  // Request fields latched when a request is accepted in s_idle
+  val addr = Reg(UInt(addrWidth.W))
+  val id = Reg(UInt(innerIdWidth.W))
+  val opcode = Reg(UInt(3.W))
+  val size_reg = Reg(UInt(in.a.bits.size.getWidth.W))
+  
+  val ren = RegInit(N)
+  val wen = RegInit(N)
+
+  val start_beat = in_addr(innerBeatMSB, innerBeatLSB)
+  val inner_end_beat_reg = Reg(UInt(4.W))
+  // In s_idle the register is not loaded yet, so compute the end beat combinationally
+  val inner_end_beat = Mux(state === s_idle, start_beat + in_len, inner_end_beat_reg)
+
+  // gather write data beat count
+  val gather_curr_beat_reg = RegInit(0.asUInt(log2Ceil(innerDataBeats).W))
+  val gather_curr_beat = Mux(state === s_idle, start_beat, gather_curr_beat_reg)
+  val gather_last_beat = gather_curr_beat === inner_end_beat
+
+  // read response beat count
+  val resp_curr_beat = RegInit(0.asUInt(log2Ceil(innerDataBeats).W))
+  val resp_last_beat = resp_curr_beat === inner_end_beat
+
+  // state transitions:
+  // s_idle: idle state
+  // capture requests
+  // --------------------------------------------------------------------------------
+  when (state === s_idle) {
+    when (in_read_req) {
+      ren := Y
+      wen := N
+
+      addr := in_addr
+      id := in_id
+      opcode := in_opcode
+      size_reg := in.a.bits.size
+
+      resp_curr_beat := start_beat
+      inner_end_beat_reg := start_beat + in_len
+
+      state := s_wait_arready
+    } .elsewhen (in_write_req) {
+      ren := N
+      wen := Y
+      addr := in_addr
+      id := in_id
+      opcode := in_opcode
+      size_reg := in.a.bits.size
+
+      resp_curr_beat := start_beat
+      inner_end_beat_reg := start_beat + in_len
+
+      // May be overridden below when the first beat is also the last one
+      state := s_gather_write_data
+    } .elsewhen (in.b.fire() || in.c.fire() || in.e.fire()) {
+      assert(N, "Inner tilelink Unexpected handshake")
+    }
+  }
+
+
+  // s_gather_write_data:
+  // gather write data
+  // --------------------------------------------------------------------------------
+  val data_buf = Reg(Vec(outerDataBeats, UInt(outerBeatSize.W)))
+  val data_mask = Reg(Vec(outerDataBeats, UInt(outerBeatBytes.W)))
+
+  // tilelink receives the first data beat when address handshake
+  // which is different from axi
+  val first_data_beat = state === s_idle && in_write_req
+  val following_data_beat = state === s_gather_write_data && in_recv_fire
+  val gather_data_beat = first_data_beat || following_data_beat
+
+  when (first_data_beat) {
+    gather_curr_beat_reg := start_beat + 1.U
+  }
+  
+  when (following_data_beat) {
+    gather_curr_beat_reg := gather_curr_beat_reg + 1.U
+  }
+
+  when (first_data_beat || following_data_beat) {
+    // Slice i of the inner beat lands in outer-beat slot (beat << splitBits) + i
+    for (i <- 0 until split) {
+      data_buf((gather_curr_beat << splitBits) + i.U) := in_data(outerBeatSize * (i + 1) - 1, outerBeatSize * i)
+      data_mask((gather_curr_beat << splitBits) + i.U) := in_data_mask(outerBeatBytes * (i + 1) - 1, outerBeatBytes * i)
+    }
+    when (gather_last_beat) {
+      state := s_write_resp
+    }
+  }
+
+  // The AccessAck is sent before the AXI write is issued
+  when (state === s_write_resp && in.d.fire()) {
+    state := s_wait_awready
+  }
+
+  // s_wait_arready, s_mem_read, s_read_resp
+  // deal with read
+  // --------------------------------------------------------------------------------
+  when (state === s_wait_arready && out.ar.fire()) {
+    state := s_mem_read
+  }
+
+  val out_rdata_fire = out.r.fire()
+  val (refill_cnt, refill_done) = Counter(out_rdata_fire && state === s_mem_read, outerDataBeats)
+  when (state === s_mem_read && out_rdata_fire) {
+    data_buf(refill_cnt) := out.r.bits.data
+    when (refill_done) {
+      state := s_read_resp
+    }
+  }
+
+  when (state === s_read_resp && in.d.fire()) {
+    resp_curr_beat := resp_curr_beat + 1.U
+    when (resp_last_beat) {
+      state := s_idle
+    }
+  }
+
+  // The `split` outer-beat slices that make up the inner beat currently being returned
+  val resp_data = Wire(Vec(split, UInt(outerBeatSize.W)))
+  for (i <- 0 until split) {
+    resp_data(i) := data_buf((resp_curr_beat << splitBits) + i.U)
+  }
+  
+
+  // deal with write
+  // s_wait_awready & s_mem_write
+  // --------------------------------------------------------------------------------
+  when (state === s_wait_awready && out.aw.fire()) {
+    state := s_mem_write
+  }
+
+  val (wb_cnt, wb_done) = Counter(out.w.fire() && state === s_mem_write, outerDataBeats)
+  when (state === s_mem_write && wb_done) {
+    state := s_wait_bresp
+  }
+
+  when (state === s_wait_bresp && out.b.fire()) {
+    state := s_idle
+  }
+
+  // IO ports
+  // Input tilelink channels
+  // --------------------------------------------------------------------------------
+
+  // channel A
+  in.a.ready := state === s_idle || state === s_gather_write_data
+
+  // channel B
+  in.b.valid := N
+
+  // channel C
+  in.c.ready := N
+
+  // channel D
+  val in_read_resp = state === s_read_resp
+  val in_write_resp = state === s_write_resp
+  in.d.valid := in_write_resp || in_read_resp
+  in.d.bits.opcode  := Mux(in_read_resp, TLMessages.AccessAckData, TLMessages.AccessAck)
+  in.d.bits.param   := 0.U
+  in.d.bits.size    := size_reg
+  in.d.bits.source  := id
+  in.d.bits.sink    := 0.U
+  in.d.bits.denied  := N
+  // resp_data already holds the slices of the current beat (slice 0 in the low
+  // bits, mirroring how write data is split above), so concatenate them rather
+  // than indexing the split-sized Vec with the beat counter.
+  in.d.bits.data    := resp_data.asUInt
+  in.d.bits.corrupt := N
+
+  // channel E
+  in.e.ready := N
+
+  // Output AXI4 channels
+  // --------------------------------------------------------------------------------
+  val axi4_size = log2Up(outerBeatBytes).U
+
+  // AW channel
+  // write address channel signals
+  out.aw.valid := state === s_wait_awready
+  out.aw.bits.id := 0.U
+  out.aw.bits.addr := addr
+  out.aw.bits.len := (outerDataBeats - 1).asUInt(8.W)
+  out.aw.bits.size := axi4_size
+  out.aw.bits.burst := AXI4Parameters.BURST_INCR       // normal sequential memory
+  out.aw.bits.lock := 0.asUInt(1.W)
+  out.aw.bits.cache := AXI4Parameters.CACHE_RALLOCATE | AXI4Parameters.CACHE_WALLOCATE | AXI4Parameters.CACHE_MODIFIABLE | AXI4Parameters.CACHE_BUFFERABLE
+  out.aw.bits.prot := 0.asUInt(3.W)
+  out.aw.bits.qos := 0.asUInt(4.W)
+
+  // W channel
+  // write data channel signals
+  out.w.valid := state === s_mem_write
+  out.w.bits.data := data_buf(wb_cnt)
+  // NOTE(review): all byte lanes are always enabled; the gathered data_mask is not used here
+  out.w.bits.strb := Fill(outerBeatBytes, 1.asUInt(1.W))
+  out.w.bits.last := wb_cnt === (outerDataBeats - 1).U
+
+  // B channel
+  // write response channel signals
+  out.b.ready := state === s_wait_bresp
+
+  // AR channel
+  // read address channel signals
+  out.ar.valid := state === s_wait_arready
+  out.ar.bits.id := 0.asUInt(outerIdWidth.W)
+  out.ar.bits.addr := addr
+  out.ar.bits.len := (outerDataBeats - 1).asUInt(8.W)
+  out.ar.bits.size := axi4_size
+  out.ar.bits.burst := AXI4Parameters.BURST_INCR
+  out.ar.bits.lock := 0.asUInt(1.W)
+  out.ar.bits.cache := AXI4Parameters.CACHE_RALLOCATE | AXI4Parameters.CACHE_WALLOCATE | AXI4Parameters.CACHE_MODIFIABLE | AXI4Parameters.CACHE_BUFFERABLE
+  out.ar.bits.prot := 0.asUInt(3.W)
+  out.ar.bits.qos := 0.asUInt(4.W)
+
+  // R channel
+  // read data channel signals
+  out.r.ready := state === s_mem_read
+}
+
+
+// Factory shorthand for constructing a NaiveTLToAXI4.
+object NaiveTLToAXI4
+{
+  // NOTE(review): returns the bare instance without a Module(...) wrapper;
+  // presumably the caller is expected to wrap it — confirm.
+  def apply(params: TLParameters) = new NaiveTLToAXI4(params)
+}
--- a/src/main/scala/bus/tilelink/TLUtilities.scala
+++ b/src/main/scala/bus/tilelink/TLUtilities.scala
--- a/src/main/scala/bus/tilelink/TileLink.scala
+++ b/src/main/scala/bus/tilelink/TileLink.scala
+// See LICENSE.SiFive for license details.
+
+package bus.tilelink
+
+import chisel3._
+import chisel3.util._
+
+import xiangshan.HasXSParameter
+
+/** Width configuration consumed by the TileLink bundles declared in this file. */
+case class TLParameters(
+  addressBits: Int = 64,
+  dataBits: Int = 64,
+  sourceBits: Int = 1,
+  sinkBits: Int = 1,
+  sizeBits: Int = 3,
+  maxTransfer: Int = 64
+) {
+  /** Number of bytes carried by one data beat. */
+  def beatBytes = dataBits / 8
+
+  /** log2Ceil of `maxTransfer`. */
+  val maxLgSize = log2Ceil(maxTransfer)
+}
+
+
+// Opcode encodings for the five TileLink channels, plus per-opcode name tables.
+// Encodings are reused across channels (e.g. 6 is AcquireBlock, Probe, Release and ReleaseAck).
+object TLMessages 
+{
+  // opcode width
+  val width = 3 
+  // A dot under a channel letter marks a channel on which the opcode is used;
+  // the name after "=>" is the expected response.
+  //                                       A    B    C    D    E
+  def PutFullData    = 0.U(width.W) //     .    .                   => AccessAck
+  def PutPartialData = 1.U(width.W) //     .    .                   => AccessAck
+  def ArithmeticData = 2.U(width.W) //     .    .                   => AccessAckData
+  def LogicalData    = 3.U(width.W) //     .    .                   => AccessAckData
+  def Get            = 4.U(width.W) //     .    .                   => AccessAckData
+  def Hint           = 5.U(width.W) //     .    .                   => HintAck
+  def AcquireBlock   = 6.U(width.W) //     .                        => Grant[Data]
+  def AcquirePerm    = 7.U(width.W) //     .                        => Grant[Data]
+  def Probe          = 6.U(width.W) //          .                   => ProbeAck[Data]
+  def AccessAck      = 0.U(width.W) //               .    .
+  def AccessAckData  = 1.U(width.W) //               .    .
+  def HintAck        = 2.U(width.W) //               .    .
+  def ProbeAck       = 4.U(width.W) //               .
+  def ProbeAckData   = 5.U(width.W) //               .
+  def Release        = 6.U(width.W) //               .              => ReleaseAck
+  def ReleaseData    = 7.U(width.W) //               .              => ReleaseAck
+  def Grant          = 4.U(width.W) //                    .         => GrantAck
+  def GrantData      = 5.U(width.W) //                    .         => GrantAck
+  def ReleaseAck     = 6.U(width.W) //                    .
+  def GrantAck       = 0.U(width.W) //                         .
+ 
+  // Range checks against the highest opcode defined for each channel
+  def isA(x: UInt) = x <= AcquirePerm
+  def isB(x: UInt) = x <= Probe
+  def isC(x: UInt) = x <= ReleaseData
+  def isD(x: UInt) = x <= ReleaseAck
+
+  // Response opcode tables, indexed by request opcode
+  def adResponse = VecInit(AccessAck, AccessAck, AccessAckData, AccessAckData, AccessAckData, HintAck, Grant, Grant)
+  def bcResponse = VecInit(AccessAck, AccessAck, AccessAckData, AccessAckData, AccessAckData, HintAck, ProbeAck, ProbeAck)
+  
+  // Per-channel tables of (opcode name, names of that opcode's param values), in opcode order
+  def a = Seq( ("PutFullData",TLPermissions.PermMsgReserved),
+               ("PutPartialData",TLPermissions.PermMsgReserved),
+               ("ArithmeticData",TLAtomics.ArithMsg),
+               ("LogicalData",TLAtomics.LogicMsg),
+               ("Get",TLPermissions.PermMsgReserved),
+               ("Hint",TLHints.HintsMsg),
+               ("AcquireBlock",TLPermissions.PermMsgGrow),
+               ("AcquirePerm",TLPermissions.PermMsgGrow))
+
+  def b = Seq( ("PutFullData",TLPermissions.PermMsgReserved),
+               ("PutPartialData",TLPermissions.PermMsgReserved),
+               ("ArithmeticData",TLAtomics.ArithMsg),
+               ("LogicalData",TLAtomics.LogicMsg),
+               ("Get",TLPermissions.PermMsgReserved),
+               ("Hint",TLHints.HintsMsg),
+               ("Probe",TLPermissions.PermMsgCap))
+
+  def c = Seq( ("AccessAck",TLPermissions.PermMsgReserved),
+               ("AccessAckData",TLPermissions.PermMsgReserved),
+               ("HintAck",TLPermissions.PermMsgReserved),
+               ("Invalid Opcode",TLPermissions.PermMsgReserved),
+               ("ProbeAck",TLPermissions.PermMsgReport),
+               ("ProbeAckData",TLPermissions.PermMsgReport),
+               ("Release",TLPermissions.PermMsgReport),
+               ("ReleaseData",TLPermissions.PermMsgReport))
+
+  def d = Seq( ("AccessAck",TLPermissions.PermMsgReserved),
+               ("AccessAckData",TLPermissions.PermMsgReserved),
+               ("HintAck",TLPermissions.PermMsgReserved),
+               ("Invalid Opcode",TLPermissions.PermMsgReserved),
+               ("Grant",TLPermissions.PermMsgCap),
+               ("GrantData",TLPermissions.PermMsgCap),
+               ("ReleaseAck",TLPermissions.PermMsgReserved))
+
+}
+
+/**
+  * The three primary TileLink permissions are:
+  *   (T)runk: the agent is (or is on inwards path to) the global point of serialization.
+  *   (B)ranch: the agent is on an outwards path from the Trunk and may only read.
+  *   (N)one: the agent holds neither read nor write permission.
+  * These permissions are permuted by transfer operations in various ways.
+  * Operations can cap permissions, request for them to be grown or shrunk,
+  * or for a report on their current status.
+  *
+  * Every encoding below is built as `value.U(width.W)`. Writing `value.U(width)`
+  * with a plain Int would be a bit extraction on the literal, not a width,
+  * and would turn every constant here into a false Bool.
+  */
+object TLPermissions
+{
+  val aWidth = 2
+  val bdWidth = 2
+  val cWidth = 3
+
+  // Cap types (Grant = new permissions, Probe = permissions <= target)
+  def toT = 0.U(bdWidth.W)
+  def toB = 1.U(bdWidth.W)
+  def toN = 2.U(bdWidth.W)
+  def isCap(x: UInt) = x <= toN
+
+  // Grow types (Acquire = permissions >= target)
+  def NtoB = 0.U(aWidth.W)
+  def NtoT = 1.U(aWidth.W)
+  def BtoT = 2.U(aWidth.W)
+  def isGrow(x: UInt) = x <= BtoT
+
+  // Shrink types (ProbeAck, Release)
+  def TtoB = 0.U(cWidth.W)
+  def TtoN = 1.U(cWidth.W)
+  def BtoN = 2.U(cWidth.W)
+  def isShrink(x: UInt) = x <= BtoN
+
+  // Report types (ProbeAck, Release)
+  def TtoT = 3.U(cWidth.W)
+  def BtoB = 4.U(cWidth.W)
+  def NtoN = 5.U(cWidth.W)
+  def isReport(x: UInt) = x <= NtoN
+
+  // Human-readable names, indexed by the encodings above
+  def PermMsgGrow:Seq[String] = Seq("Grow NtoB", "Grow NtoT", "Grow BtoT")
+  def PermMsgCap:Seq[String] = Seq("Cap toT", "Cap toB", "Cap toN")
+  def PermMsgReport:Seq[String] = Seq("Shrink TtoB", "Shrink TtoN", "Shrink BtoN", "Report TtoT", "Report BtoB", "Report NtoN")
+  def PermMsgReserved:Seq[String] = Seq("Reserved") 
+}
+
+// Param encodings for the ArithmeticData and LogicalData opcodes.
+// Literals use `.W` so the Int is taken as a width; `n.U(width)` with a plain
+// Int would instead extract a single bit from the literal.
+object TLAtomics
+{
+  val width = 3 
+
+  // Arithmetic types
+  def MIN  = 0.U(width.W)
+  def MAX  = 1.U(width.W)
+  def MINU = 2.U(width.W)
+  def MAXU = 3.U(width.W)
+  def ADD  = 4.U(width.W)
+  def isArithmetic(x: UInt) = x <= ADD
+
+  // Logical types
+  def XOR  = 0.U(width.W)
+  def OR   = 1.U(width.W)
+  def AND  = 2.U(width.W)
+  def SWAP = 3.U(width.W)
+  def isLogical(x: UInt) = x <= SWAP
+
+  // Human-readable names, indexed by the encodings above
+  def ArithMsg:Seq[String] = Seq("MIN", "MAX", "MINU", "MAXU", "ADD")
+  def LogicMsg:Seq[String] = Seq("XOR", "OR", "AND", "SWAP")
+}
+ 
+
+// Param encodings for the Hint opcode.
+// Literals use `.W` so the Int is taken as a width; `n.U(width)` with a plain
+// Int would instead extract a single bit from the literal.
+object TLHints
+{
+  val width = 1
+
+  def PREFETCH_READ  = 0.U(width.W)
+  def PREFETCH_WRITE = 1.U(width.W)
+  def isHints(x: UInt) = x <= PREFETCH_WRITE
+
+  // Human-readable names, indexed by the encodings above
+  def HintsMsg:Seq[String] = Seq("PrefetchRead", "PrefetchWrite")
+}
+
+// Common base of all TileLink channel payload bundles.
+sealed trait TLChannel extends Bundle {
+  val channelName: String
+  val params: TLParameters
+}
+
+// Marker traits: A, B and C extend TLAddrChannel, D extends TLDataChannel, E extends TLChannel only.
+sealed trait TLDataChannel extends TLChannel
+sealed trait TLAddrChannel extends TLDataChannel
+
+/** Payload carried on the TileLink 'A' channel. Field order is kept as declared. */
+class TLBundleA(override val params: TLParameters) extends TLAddrChannel
+{
+  val channelName = "'A' channel"
+  val opcode  = UInt(TLMessages.width.W)
+  val param   = UInt(3.W)
+  val size    = UInt(params.sizeBits.W)
+  val source  = UInt(params.sourceBits.W)
+  val address = UInt(params.addressBits.W)
+  // One mask bit per data byte
+  val mask    = UInt(params.beatBytes.W)
+  val data    = UInt(params.dataBits.W)
+  val corrupt = Bool()
+}
+
+/** Payload carried on the TileLink 'B' channel. Field order is kept as declared. */
+class TLBundleB(override val params: TLParameters) extends TLAddrChannel
+{
+  val channelName = "'B' channel"
+  val opcode  = UInt(TLMessages.width.W)
+  val param   = UInt(3.W)
+  val size    = UInt(params.sizeBits.W)
+  val source  = UInt(params.sourceBits.W)
+  val address = UInt(params.addressBits.W)
+  // One mask bit per data byte
+  val mask    = UInt(params.beatBytes.W)
+  val data    = UInt(params.dataBits.W)
+  val corrupt = Bool()
+}
+
+// Payload carried on the TileLink 'C' channel; unlike A and B it has no mask field.
+class TLBundleC(override val params: TLParameters) extends TLAddrChannel
+{
+  val channelName = "'C' channel"
+  val opcode  = UInt(3.W)
+  val param   = UInt(3.W)
+  val size    = UInt(params.sizeBits.W)
+  val source  = UInt(params.sourceBits.W)
+  val address = UInt(params.addressBits.W)
+  val data    = UInt(params.dataBits.W)
+  val corrupt = Bool()
+}
+// Payload carried on the TileLink 'D' channel; it has sink and denied fields but no address.
+class TLBundleD(override val params: TLParameters) extends TLDataChannel
+{
+  val channelName = "'D' channel"
+  val opcode  = UInt(3.W)
+  val param   = UInt(2.W)
+  val size    = UInt(params.sizeBits.W)
+  val source  = UInt(params.sourceBits.W)
+  val sink    = UInt(params.sinkBits.W)
+  val denied  = Bool()
+  val data    = UInt(params.dataBits.W)
+  val corrupt = Bool()
+}
+
+// Payload carried on the TileLink 'E' channel; the sink id is its only hardware field.
+class TLBundleE(override val params: TLParameters) extends TLChannel
+{
+  val channelName = "'E' channel"
+  val sink    = UInt(params.sinkBits.W)
+}
+
+// TL-UL and TL-UC
+// Port bundle with only the A (outgoing) and D (incoming, flipped) channels.
+class TLUnCached(val params: TLParameters) extends Bundle {
+  val a = Decoupled(new TLBundleA(params))
+  val d = Flipped(Decoupled(new TLBundleD(params)))
+}
+
+// TL-C
+// Adds the B (incoming, flipped), C and E (outgoing) channels on top of TLUnCached.
+class TLCached(override val params: TLParameters) extends TLUnCached(params) {
+  val b = Flipped(Decoupled(new TLBundleB(params)))
+  val c = Decoupled(new TLBundleC(params))
+  val e = Decoupled(new TLBundleE(params))
+}
+
+// Factory: TLUnCached(params) builds a new TLUnCached bundle.
+object TLUnCached
+{
+  def apply(params: TLParameters) = new TLUnCached(params)
+}
+
+// Factory: TLCached(params) builds a new TLCached bundle.
+object TLCached
+{
+  def apply(params: TLParameters) = new TLCached(params)
+}
--- a/src/main/scala/utils/BitUtils.scala
+++ b/src/main/scala/utils/BitUtils.scala
@@ -2,6 +2,7 @@ package utils

 import chisel3._
 import chisel3.util._
+import scala.math.min

 object WordShift {
  def apply(data: UInt, wordIndex: UInt, step: Int) = (data << (wordIndex * step.U))
@@ -31,3 +32,23 @@ object ZeroExt {
    if (aLen == len) a else Cat(0.U((len - aLen).W), a)
  }
 }
+
+object Or {
+  // Fill 1s from low bits to high bits: each set bit is OR-ed into the bits above it (`cap` bounds the shift doubling)
+  def leftOR(x: UInt): UInt = leftOR(x, x.getWidth, x.getWidth)
+  def leftOR(x: UInt, width: Integer, cap: Integer = 999999): UInt = {
+    val stop = min(width, cap)
+    def helper(s: Int, x: UInt): UInt =
+      if (s >= stop) x else helper(s+s, x | (x << s)(width-1,0))
+    helper(1, x)(width-1, 0)
+  }
+
+  // Fill 1s from high bits to low bits: each set bit is OR-ed into the bits below it (`cap` bounds the shift doubling)
+  def rightOR(x: UInt): UInt = rightOR(x, x.getWidth, x.getWidth)
+  def rightOR(x: UInt, width: Integer, cap: Integer = 999999): UInt = {
+    val stop = min(width, cap)
+    def helper(s: Int, x: UInt): UInt =
+      if (s >= stop) x else helper(s+s, x | (x >> s))
+    helper(1, x)(width-1, 0)
+  }
+}
--- a/src/main/scala/utils/ECC.scala
+++ b/src/main/scala/utils/ECC.scala
+// See LICENSE.Berkeley for license details.
+
+package utils
+
+import chisel3._
+import chisel3.util._
+import chisel3.util.random.LFSR
+
+/** Result of decoding one codeword with a [[Code]]. */
+abstract class Decoding
+{
+  // data bits exactly as found in the codeword
+  def uncorrected: UInt
+  // data bits after the code has applied whatever correction it supports
+  def corrected: UInt
+  def correctable: Bool
+  def uncorrectable: Bool // If true, correctable should be ignored
+  // any error at all, correctable or not
+  def error = correctable || uncorrectable
+}
+
+/** Interface of an error detecting/correcting code over a UInt data word. */
+abstract class Code
+{
+  def canDetect: Boolean
+  def canCorrect: Boolean
+
+  /** Width of the codeword produced for a data word of `w0` bits. */
+  def width(w0: Int): Int
+
+  /** Encode x to a codeword suitable for decode.
+   *  If poison is true, the decoded value will report uncorrectable
+   *  error despite uncorrected == corrected == x.
+   */
+  def encode(x: UInt, poison: Bool = false.B): UInt
+  def decode(x: UInt): Decoding
+
+  /** Copy the bits in x to the right bit positions in an encoded word,
+   *  so that x === decode(swizzle(x)).uncorrected; but don't generate
+   *  the other code bits, so decode(swizzle(x)).error might be true.
+   *  For codes for which this operation is not trivial, throw an
+   *  UnsupportedOperationException.  */
+  def swizzle(x: UInt): UInt
+}
+
+/** Pass-through code: the codeword is the data word and no error is ever reported. */
+class IdentityCode extends Code
+{
+  def canDetect = false
+  def canCorrect = false
+
+  def width(w0: Int) = w0
+  def encode(x: UInt, poison: Bool = false.B) = {
+    // there are no redundant bits to carry a poison marker, so only a literal false is accepted
+    require (poison.isLit && poison.litValue == 0, "IdentityCode can not be poisoned")
+    x
+  }
+  def swizzle(x: UInt) = x
+  def decode(y: UInt) = new Decoding {
+    def uncorrected = y
+    def corrected = y
+    def correctable = false.B
+    def uncorrectable = false.B
+  }
+}
+
+/** Single parity bit placed above the data word: detects errors, corrects nothing. */
+class ParityCode extends Code
+{
+  def canDetect = true
+  def canCorrect = false
+
+  def width(w0: Int) = w0+1
+  // the top bit is the XOR of all data bits, inverted when `poison` is set
+  def encode(x: UInt, poison: Bool = false.B) = Cat(x.xorR ^ poison, x)
+  // data in the low bits, parity bit left at zero
+  def swizzle(x: UInt) = Cat(false.B, x)
+  def decode(y: UInt) = new Decoding {
+    // drop the parity bit (the MSB of y)
+    val uncorrected = y(y.getWidth-2,0)
+    val corrected = uncorrected
+    val correctable = false.B
+    // XOR over the whole codeword is non-zero exactly when the parity check fails
+    val uncorrectable = y.xorR
+  }
+}
+
+/** Single-error-correcting Hamming code kept in systematic form:
+ *  the data word occupies the low bits of the codeword and the check bits sit above it.
+ */
+class SECCode extends Code
+{
+  def canDetect = true
+  def canCorrect = true
+
+  // SEC codes may or may not be poisonous depending on the length
+  // If the code is perfect, every non-codeword is correctable
+  def poisonous(n: Int) = !isPow2(n+1)
+
+  // codeword length for k data bits: k + m check bits (m = log2Floor(k)+1),
+  // plus one more check bit when 2^m < m+k+1
+  def width(k: Int) = {
+    val m = log2Floor(k) + 1
+    k + m + (if((1 << m) < m+k+1) 1 else 0)
+  }
+  // data in the low k bits, all check bits left at zero
+  def swizzle(x: UInt) = {
+    val k = x.getWidth
+    val n = width(k)
+    Cat(0.U((n-k).W), x)
+  }
+
+  // An (n=16, k=11) Hamming code is naturally encoded as:
+  //   PPxPxxxPxxxxxxxP where P are parity bits and x are data
+  //   Indexes typically start at 1, because then the P are on powers of two
+  // In systematic coding, you put all the data in the front:
+  //   xxxxxxxxxxxPPPPP
+  //   Indexes typically start at 0, because Computer Science
+  // For sanity when reading SRAMs, you want systematic form.
+
+  // Builds the index maps between the natural (1-based) Hamming layout and the systematic layout,
+  // plus a generator for the per-check-bit masks.
+  // Returns (hamm2sys, sys2hamm, syndrome):
+  //   hamm2sys(i)  systematic position of Hamming index i (power-of-two indices go to k + log2(i))
+  //   sys2hamm(i)  Hamming index stored at systematic position i (inverse of hamm2sys)
+  //   syndrome(j)  n-bit literal whose bit i is set when bit j of sys2hamm(i) is set
+  private def impl(n: Int, k: Int) = {
+    require (n >= 3 && k >= 1 && !isPow2(n))
+    val hamm2sys = IndexedSeq.tabulate(n+1) { i =>
+      if (i == 0) {
+        n /* undefined */
+      } else if (isPow2(i)) {
+        k + log2Ceil(i)
+      } else {
+        i - 1 - log2Ceil(i)
+      }
+    }
+    val sys2hamm = hamm2sys.zipWithIndex.sortBy(_._1).map(_._2).toIndexedSeq
+    def syndrome(j: Int) = {
+      val bit = 1 << j
+      ("b" + Seq.tabulate(n) { i =>
+        if ((sys2hamm(i) & bit) != 0) "1" else "0"
+      }.reverse.mkString).U
+    }
+    (hamm2sys, sys2hamm, syndrome _)
+  }
+
+  def encode(x: UInt, poison: Bool = false.B) = {
+    val k = x.getWidth
+    val n = width(k)
+    val (_, _, syndrome) = impl(n, k)
+
+    require ((poison.isLit && poison.litValue == 0) || poisonous(n), s"SEC code of length ${n} cannot be poisoned")
+
+    /* By setting the entire syndrome on poison, the corrected bit falls off the end of the code */
+    // each check bit is the parity of the data bits selected by its mask, inverted when poisoned
+    val syndromeUInt = VecInit.tabulate(n-k) { j => (syndrome(j)(k-1, 0) & x).xorR ^ poison }.asUInt
+    Cat(syndromeUInt, x)
+  }
+
+  def decode(y: UInt) = new Decoding {
+    val n = y.getWidth
+    val k = n - log2Ceil(n)
+    val (_, sys2hamm, syndrome) = impl(n, k)
+
+    // recompute every check over the full codeword (data and stored check bits)
+    val syndromeUInt = VecInit.tabulate(n-k) { j => (syndrome(j) & y).xorR }.asUInt
+
+    // the syndrome value is the Hamming index of the flipped bit; map it onto the data positions
+    val hammBadBitOH = UIntToOH(syndromeUInt, n+1)
+    val sysBadBitOH = VecInit.tabulate(k) { i => hammBadBitOH(sys2hamm(i)) }.asUInt
+
+    val uncorrected = y(k-1, 0)
+    val corrected = uncorrected ^ sysBadBitOH
+    val correctable = syndromeUInt.orR
+    // a syndrome beyond the codeword length points at no real bit; only possible for non-perfect lengths
+    val uncorrectable = if (poisonous(n)) { syndromeUInt > n.U } else { false.B }
+  }
+}
+
+/** SEC code with one extra overall parity bit on top (built from [[SECCode]] and [[ParityCode]]). */
+class SECDEDCode extends Code
+{
+  def canDetect = true
+  def canCorrect = true
+
+  private val sec = new SECCode
+  private val par = new ParityCode
+
+  def width(k: Int) = sec.width(k)+1
+  def encode(x: UInt, poison: Bool = false.B) = {
+    // toggling two bits ensures the error is uncorrectable
+    // to ensure corrected == uncorrected, we pick one redundant
+    // bit from SEC (the highest); correcting it does not affect
+    // corrected == uncorrected. the second toggled bit is the
+    // parity bit, which also does not appear in the decoding
+    val toggle_lo = Cat(poison.asUInt, poison.asUInt)
+    val toggle_hi = toggle_lo << (sec.width(x.getWidth)-1)
+    par.encode(sec.encode(x)) ^ toggle_hi
+  }
+  def swizzle(x: UInt) = par.swizzle(sec.swizzle(x))
+  def decode(x: UInt) = new Decoding {
+    // the SEC part is everything below the top (parity) bit
+    val secdec = sec.decode(x(x.getWidth-2,0))
+    val pardec = par.decode(x)
+
+    val uncorrected = secdec.uncorrected
+    val corrected = secdec.corrected
+    // overall parity fails: treated as a single, correctable error
+    val correctable = pardec.uncorrectable
+    // overall parity holds but the SEC syndrome is non-zero: reported as uncorrectable
+    val uncorrectable = !pardec.uncorrectable && secdec.correctable
+  }
+}
+
+/** Pseudo-random single-bit error injector driven by a 16-bit LFSR. */
+object ErrGen
+{
+  // generate a 1-bit error with approximate probability 2^-f
+  // The low log2Up(width)+f LFSR bits are one-hot decoded and only the low `width` bits kept,
+  // so the result is a one-hot mask or all zeros.
+  def apply(width: Int, f: Int): UInt = {
+    require(width > 0 && f >= 0 && log2Up(width) + f <= 16)
+    UIntToOH(LFSR(16)(log2Up(width)+f-1,0))(width-1,0)
+  }
+  // x with the generated error mask XOR-ed in
+  def apply(x: UInt, f: Int): UInt = x ^ apply(x.getWidth, f)
+}
+
+/** Mixin for bundles that may expose error reports; each port is optional (an `Option`). */
+trait CanHaveErrors extends Bundle {
+  val correctable: Option[ValidIO[UInt]]
+  val uncorrectable: Option[ValidIO[UInt]]
+}
+
+/** ECC configuration record; the defaults select [[IdentityCode]] (no protection) and no error notification. */
+case class ECCParams(
+  bytes: Int = 1,
+  code: Code = new IdentityCode,
+  notifyErrors: Boolean = false
+)
+
+/** Factory that selects a [[Code]] implementation by (case-insensitive) name. */
+object Code {
+  /** An absent name is treated as "none". */
+  def fromString(s: Option[String]): Code = fromString(s.getOrElse("none"))
+
+  /** Accepts "none"/"identity", "parity", "sec" and "secded"; anything else is rejected
+   *  with an IllegalArgumentException.
+   */
+  def fromString(s: String): Code = {
+    val name = s.toLowerCase
+    name match {
+      case "none" | "identity" => new IdentityCode
+      case "parity"            => new ParityCode
+      case "sec"               => new SECCode
+      case "secded"            => new SECDEDCode
+      case _                   => throw new IllegalArgumentException("Unknown ECC type")
+    }
+  }
+}
--- a/src/main/scala/utils/LookupTree.scala
+++ b/src/main/scala/utils/LookupTree.scala
@@ -12,3 +12,32 @@ object LookupTreeDefault {
  def apply[T <: Data](key: UInt, default: T, mapping: Iterable[(UInt, T)]): T =
    MuxLookup(key, default, mapping.toSeq)
 }
+
+// MuxT: element-wise Mux over tuples (2, 3 or 4 components) sharing one condition.
+object MuxT {
+  def apply[T <: Data, U <: Data](cond: Bool, con: (T, U), alt: (T, U)): (T, U) =
+    (Mux(cond, con._1, alt._1), Mux(cond, con._2, alt._2))
+
+  def apply[T <: Data, U <: Data, W <: Data](cond: Bool, con: (T, U, W), alt: (T, U, W)): (T, U, W) =
+    (Mux(cond, con._1, alt._1), Mux(cond, con._2, alt._2), Mux(cond, con._3, alt._3))
+
+  def apply[T <: Data, U <: Data, W <: Data, X <: Data](cond: Bool, con: (T, U, W, X), alt: (T, U, W, X)): (T, U, W, X) =
+    (Mux(cond, con._1, alt._1), Mux(cond, con._2, alt._2), Mux(cond, con._3, alt._3), Mux(cond, con._4, alt._4))
+}
+
+/** Creates a cascade of n MuxTs to search for a key value; the earliest matching entry of `mapping` wins, otherwise `default`. */
+object MuxTLookup {
+  def apply[S <: UInt, T <: Data, U <: Data](key: S, default: (T, U), mapping: Seq[(S, (T, U))]): (T, U) = {
+    var res = default
+    for ((k, v) <- mapping.reverse)
+      res = MuxT(k === key, v, res)
+    res
+  }
+
+  def apply[S <: UInt, T <: Data, U <: Data, W <: Data](key: S, default: (T, U, W), mapping: Seq[(S, (T, U, W))]): (T, U, W) = {
+    var res = default
+    for ((k, v) <- mapping.reverse)
+      res = MuxT(k === key, v, res)
+    res
+  }
+}
--- a/src/main/scala/utils/Misc.scala
+++ b/src/main/scala/utils/Misc.scala
+package utils
+
+import chisel3._
+import chisel3.util._
+
+// This gets used everywhere, so make the smallest circuit possible ...
+// Given an address and size, create a mask of beatBytes size
+// eg: (0x3, 0, 4) => 0001, (0x3, 1, 4) => 0011, (0x3, 2, 4) => 1111
+// groupBy applies an interleaved OR reduction; groupBy=2 take 0010 => 01
+//
+// Arguments: addr_lo  low address bits (bits below log2(beatBytes) are read)
+//            lgSize   log2 of the access size in bytes
+//            beatBytes, groupBy  powers of two with beatBytes >= groupBy >= 1
+object MaskGen {
+  def apply(addr_lo: UInt, lgSize: UInt, beatBytes: Int, groupBy: Int = 1): UInt = {
+    require (groupBy >= 1 && beatBytes >= groupBy)
+    require (isPow2(beatBytes) && isPow2(groupBy))
+    val lgBytes = log2Ceil(beatBytes)
+    // one-hot of the access size, with the bits at and below the groupBy granularity forced on
+    val sizeOH = UIntToOH(lgSize | 0.U(log2Up(beatBytes).W), log2Up(beatBytes)) | (groupBy*2 - 1).U
+
+    // helper(i) returns 2^i (acc, eq) pairs, one per aligned chunk of beatBytes / 2^i bytes:
+    //   eq  - the top i bits of addr_lo select exactly this chunk
+    //   acc - the chunk is covered by the access
+    def helper(i: Int): Seq[(Bool, Bool)] = {
+      if (i == 0) {
+        // a single chunk spanning the whole beat: covered only by accesses of at least beatBytes
+        Seq((lgSize >= lgBytes.U, true.B))
+      } else {
+        val sub = helper(i-1)
+        val size = sizeOH(lgBytes - i)
+        val bit = addr_lo(lgBytes - i)
+        val nbit = !bit
+        Seq.tabulate (1 << i) { j =>
+          val (sub_acc, sub_eq) = sub(j/2)
+          val eq = sub_eq && (if (j % 2 == 1) bit else nbit)
+          val acc = sub_acc || (size && eq)
+          (acc, eq)
+        }
+      }
+    }
+
+    if (groupBy == beatBytes) 1.U else
+      Cat(helper(lgBytes-log2Ceil(groupBy)).map(_._1).reverse)
+  }
+}
+
+/** Reduces a random bit vector to a value (or one-hot vector) in the range [0, mod). */
+object Random
+{
+  /** Value in [0, mod) derived from `random`.
+   *
+   *  - mod == 1: constant 0 (there is nothing to choose from).
+   *  - mod a power of two: the low log2(mod) bits of `random`.
+   *  - otherwise: a wider power-of-two sample is split into `mod` nearly equal slices
+   *    and the index of the slice containing it is returned.
+   *
+   *  @param mod    number of possible results, must be positive
+   *  @param random source of random bits; must be at least as wide as the bits extracted
+   */
+  def apply(mod: Int, random: UInt): UInt = {
+    require(mod > 0, s"Random: mod($mod) must be positive")
+    // log2Ceil(1) == 0, so the power-of-two branch would ask for random(-1, 0),
+    // which is an illegal bit range; return the only possible value instead
+    if (mod == 1) 0.U
+    else if (isPow2(mod)) random(log2Ceil(mod)-1,0)
+    else PriorityEncoder(partition(apply(1 << log2Up(mod*8), random), mod))
+  }
+  /** Same as above, drawing from a fresh 16-bit LFSR. */
+  def apply(mod: Int): UInt = apply(mod, randomizer)
+  /** One-hot form of the selection made by `apply`. */
+  def oneHot(mod: Int, random: UInt): UInt = {
+    if (isPow2(mod)) UIntToOH(random(log2Up(mod)-1,0))
+    else VecInit(PriorityEncoderOH(partition(apply(1 << log2Up(mod*8), random), mod))).asUInt
+  }
+  /** Same as above, drawing from a fresh 16-bit LFSR. */
+  def oneHot(mod: Int): UInt = oneHot(mod, randomizer)
+
+  private def randomizer = LFSR16()
+  // slice i is "hit" when value lies below its upper bound; the first hit identifies the slice
+  private def partition(value: UInt, slices: Int) =
+    Seq.tabulate(slices)(i => value < (((i + 1) << value.getWidth) / slices).U)
+}
+
+
+/**
+ * Transpose a matrix of Chisel Vecs.
+ *
+ * The number of output rows is taken from the length of the first input row,
+ * so `in` must be non-empty and every row is expected to have that length.
+ */
+object Transpose
+{
+  // `Data` is chisel3.Data (the file imports chisel3._); the chisel3.core alias is deprecated
+  def apply[T <: Data](in: Vec[Vec[T]]) = {
+    val n = in(0).size
+    VecInit((0 until n).map(i => VecInit(in.map(row => row(i)))))
+  }
+}
--- a/src/main/scala/utils/Replacement.scala
+++ b/src/main/scala/utils/Replacement.scala
+// See LICENSE.Berkeley for license details.
+// See LICENSE.SiFive for license details.
+
+package utils
+
+import chisel3._
+import chisel3.util._
+import chisel3.util.random.LFSR
+
+/** Replacement policy interface: `way` yields the way to replace; `miss`/`hit` are notification hooks. */
+abstract class ReplacementPolicy {
+  def way: UInt
+  def miss: Unit
+  def hit: Unit
+}
+
+/** Random replacement: the victim way is derived from a 16-bit LFSR. */
+class RandomReplacement(ways: Int) extends ReplacementPolicy {
+  // defaults to false; `miss` overrides it to true in whatever `when` scope it is called from
+  private val replace = Wire(Bool())
+  replace := false.B
+  // `replace` is passed as the LFSR's second argument (its increment enable)
+  val lfsr = LFSR(16, replace)
+
+  def way = Random(ways, lfsr)
+  def miss = replace := true.B
+  def hit = {}
+}
+
+/** Replacement policy interface with per-set `access` and `update` hooks. */
+abstract class SeqReplacementPolicy {
+  def access(set: UInt): Unit
+  def update(valid: Bool, hit: Bool, set: UInt, way: UInt): Unit
+  def way: UInt
+}
+
+/** [[SeqReplacementPolicy]] adapter around [[RandomReplacement]]; `set` and `way` are ignored. */
+class SeqRandom(n_ways: Int) extends SeqReplacementPolicy {
+  val logic = new RandomReplacement(n_ways)
+  def access(set: UInt) = { }
+  def update(valid: Bool, hit: Bool, set: UInt, way: UInt) = {
+    // only a valid miss is forwarded to the underlying policy
+    when (valid && !hit) { logic.miss }
+  }
+  def way = logic.way
+}
+
+/** Pseudo-LRU state for `n` ways, held in an (n-1)-bit register (no reset value). */
+class PseudoLRU(n: Int)
+{
+  private val state_reg = Reg(UInt((n-1).W))
+  // record a single access to `way`
+  def access(way: UInt) {
+    state_reg := get_next_state(state_reg,way)
+  }
+  // record several accesses in one cycle: valid entries are folded into the state in sequence order
+  def access(ways: Seq[ValidIO[UInt]]) {
+    state_reg := ways.foldLeft(state_reg)((prev, way) => Mux(way.valid, get_next_state(prev, way.bits), prev))
+  }
+  // Pure function: state after an access to `way`.
+  // `idx` starts at 1 and is extended with one bit of `way` per level (MSB first);
+  // the state bit it addresses is set to the inverse of that way bit.
+  def get_next_state(state: UInt, way: UInt) = {
+    // shift left by one so the indices used below start at 1
+    var next_state = state << 1
+    var idx = 1.U(1.W)
+    for (i <- log2Up(n)-1 to 0 by -1) {
+      val bit = way(i)
+      next_state = next_state.bitSet(idx, !bit)
+      idx = Cat(idx, bit)
+    }
+    next_state(n-1, 1)
+  }
+  def replace = get_replace_way(state_reg)
+  // Pure function: way selected by `state`, built by following the addressed state bits level by level.
+  def get_replace_way(state: UInt) = {
+    val shifted_state = state << 1
+    var idx = 1.U(1.W)
+    for (i <- log2Up(n)-1 to 0 by -1) {
+      // a 1 is appended only if the resulting way index can still be below n (matters when n is not a power of two)
+      val in_bounds = Cat(idx, (BigInt(1) << i).U)(log2Up(n)-1, 0) < n.U
+      idx = Cat(idx, in_bounds && shifted_state(idx))
+    }
+    idx(log2Up(n)-1,0)
+  }
+}
+
+/** Pseudo-LRU with one (n_ways-1)-bit state word per set, stored in a SyncReadMem. */
+class SeqPLRU(n_sets: Int, n_ways: Int) extends SeqReplacementPolicy {
+  val state = SyncReadMem(n_sets, UInt((n_ways-1).W))
+  // only the pure helper functions of PseudoLRU are used here
+  val logic = new PseudoLRU(n_ways)
+  // driven by `access`
+  val current_state = Wire(UInt())
+  val plru_way = logic.get_replace_way(current_state)
+  // driven by `update`
+  val next_state = Wire(UInt())
+
+  // start reading the state of `set`
+  def access(set: UInt) = {
+    current_state := state.read(set)
+  }
+
+  // on a hit the hit way is recorded as used, otherwise the way chosen for replacement
+  def update(valid: Bool, hit: Bool, set: UInt, way: UInt) = {
+    val update_way = Mux(hit, way, plru_way)
+    next_state := logic.get_next_state(current_state, update_way)
+    when (valid) { state.write(set, next_state) }
+  }
+
+  def way = plru_way
+}
--- a/src/main/scala/xiangshan/XSCore.scala
+++ b/src/main/scala/xiangshan/XSCore.scala
@@ -9,7 +9,10 @@ import xiangshan.backend.dispatch.DP1Parameters
 import xiangshan.backend.exu.ExuParameters
 import xiangshan.frontend.Frontend
 import xiangshan.mem._
+import xiangshan.mem.cache.ICacheParameters
+import xiangshan.mem.cache.DCacheParameters
 import xiangshan.utils._
+import bus.tilelink.TLParameters

 trait HasXSParameter {
  val XLEN = 64
@@ -60,6 +63,24 @@ trait HasXSParameter {
    LduCnt = 0,
    StuCnt = 1
  )
+
+  val l1BusDataWidth = 64
+  val l1BusParams = TLParameters(
+    addressBits = PAddrBits,
+    dataBits = l1BusDataWidth,
+    sourceBits = 3,
+    sinkBits = 3
+  )
+
+  val icacheParameters = ICacheParameters(
+  )
+
+  // the width of LSU to DCache IO
+  val memWidth = 2
+  val LRSCCycles = 16
+  val dcacheParameters = DCacheParameters(
+    busParams = l1BusParams
+  )
 }

 trait HasXSLog { this: Module =>

--- a/src/main/scala/xiangshan/mem/Mem.scala
+++ b/src/main/scala/xiangshan/mem/Mem.scala
+// See LICENSE.Berkeley for license details.
+
+package xiangshan.mem
+
+import chisel3._
+import chisel3.util._
+import xiangshan.XSBundle
+
+/** Memory command encodings (M_SZ = 5 bits wide) and predicates that classify them. */
+trait MemoryOpConstants {
+  val META_SZ   = 64
+  val NUM_XA_OPS = 9
+  val M_SZ      = 5
+  def M_X       = BitPat("b?????")
+  def M_XRD     = "b00000".U // int load
+  def M_XWR     = "b00001".U // int store
+  def M_PFR     = "b00010".U // prefetch with intent to read
+  def M_PFW     = "b00011".U // prefetch with intent to write
+  def M_XA_SWAP = "b00100".U
+  def M_FLUSH_ALL = "b00101".U  // flush all lines
+  def M_XLR     = "b00110".U
+  def M_XSC     = "b00111".U
+  def M_XA_ADD  = "b01000".U
+  def M_XA_XOR  = "b01001".U
+  def M_XA_OR   = "b01010".U
+  def M_XA_AND  = "b01011".U
+  def M_XA_MIN  = "b01100".U
+  def M_XA_MAX  = "b01101".U
+  def M_XA_MINU = "b01110".U
+  def M_XA_MAXU = "b01111".U
+  def M_FLUSH   = "b10000".U // write back dirty data and cede R/W permissions
+  def M_PWR     = "b10001".U // partial (masked) store
+  def M_PRODUCE = "b10010".U // write back dirty data and cede W permissions
+  def M_CLEAN   = "b10011".U // write back dirty data and retain R/W permissions
+  def M_SFENCE  = "b10100".U // flush TLB
+  def M_WOK     = "b10111".U // check write permissions but don't perform a write
+
+  // command classification helpers
+  def isAMOLogical(cmd: UInt) = cmd === M_XA_SWAP || cmd === M_XA_XOR || cmd === M_XA_OR || cmd === M_XA_AND
+  def isAMOArithmetic(cmd: UInt) = cmd === M_XA_ADD || cmd === M_XA_MIN || cmd === M_XA_MAX || cmd === M_XA_MINU || cmd === M_XA_MAXU
+  def isAMO(cmd: UInt) = isAMOLogical(cmd) || isAMOArithmetic(cmd)
+  def isPrefetch(cmd: UInt) = cmd === M_PFR || cmd === M_PFW
+  // M_XSC and every AMO count as both a read and a write
+  def isRead(cmd: UInt) = cmd === M_XRD || cmd === M_XLR || cmd === M_XSC || isAMO(cmd)
+  def isWrite(cmd: UInt) = cmd === M_XWR || cmd === M_PWR || cmd === M_XSC || isAMO(cmd)
+  // writes plus the two commands that only announce a write: M_PFW and M_XLR
+  def isWriteIntent(cmd: UInt) = isWrite(cmd) || cmd === M_PFW || cmd === M_XLR
+}
+
+// Companion object so the constants can be referenced without mixing in the trait.
+object MemoryOpConstants extends MemoryOpConstants {
+}
+
+// Base class for memory-subsystem bundles: XSBundle plus the memory command constants.
+class MemBundle extends XSBundle
+  with MemoryOpConstants
+
+/** One request to the data cache. */
+class DCacheReq extends MemBundle
+{
+  // memory command, one of the M_* encodings
+  val cmd  = UInt(M_SZ.W)
+  val addr  = UInt(PAddrBits.W)
+  val data  = UInt(DataBits.W)
+  // one bit per byte of `data`
+  val mask  = UInt((DataBits/8).W)
+  // META_SZ bits of requester metadata; DCacheResp has a field of the same width
+  val meta  = UInt(META_SZ.W)
+}
+
+/** One response from the data cache. */
+class DCacheResp extends MemBundle
+{
+  val data = UInt(DataBits.W)
+  val meta  = UInt(META_SZ.W)
+  // NOTE(review): presumably set when the request was not accepted and must be retried - confirm against the DCache implementation
+  val nack  = Bool()
+}
+
+/** Interface between the LSU and the data cache, `memWidth` lanes wide.
+ *
+ *  The request side is a single Decoupled handshake over a vector of individually valid requests;
+ *  the response side is a flipped vector of Valid responses (no back-pressure).
+ */
+class LSUDMemIO extends MemBundle
+{
+  val req = new DecoupledIO(Vec(memWidth, Valid(new DCacheReq)))
+  val resp = Flipped(Vec(memWidth, new ValidIO(new DCacheResp)))
+}
--- a/src/main/scala/xiangshan/mem/MemPipeline.scala
+++ b/src/main/scala/xiangshan/mem/MemPipeline.scala
@@ -25,7 +25,7 @@ class MemPipeline(implicit val p: XSConfig) extends XSModule with NeedImpl{
  })

  val lsu = Module(new Lsu)
-  val dcache = Module(new Dcache)
+  val dcache = Module(new DCache)
  val mshq = Module(new MSHQ)
  val dtlb = Module(new Dtlb)
  val lsroq = Module(new LsRoq)
@@ -38,4 +38,4 @@ class MemPipeline(implicit val p: XSConfig) extends XSModule with NeedImpl{
  lsroq.io := DontCare
  sbuffer.io := DontCare

-}
\ No newline at end of file
+}
--- a/src/main/scala/xiangshan/mem/cache/AMOALU.scala
+++ b/src/main/scala/xiangshan/mem/cache/AMOALU.scala
+// See LICENSE.SiFive for license details.
+// See LICENSE.Berkeley for license details.
+
+package xiangshan.mem.cache
+
+import chisel3._
+import chisel3.util._
+
+import xiangshan.mem.MemoryOpConstants
+
+/** Store alignment helper: derives the misalignment flag, byte mask and replicated data
+ *  for a store of 2^size bytes within a container of `maxSize` bytes.
+ */
+class StoreGen(typ: UInt, addr: UInt, dat: UInt, maxSize: Int) {
+  // log2 of the access size, taken from the low bits of `typ`
+  val size = typ(log2Up(log2Up(maxSize)+1)-1,0)
+  // true when any address bit below the access size is set
+  def misaligned =
+    (addr & ((1.U << size) - 1.U)(log2Up(maxSize)-1,0)).orR
+
+  // Byte mask, built by doubling: at step i the current mask is placed in the half selected by addr(i),
+  // and the upper half is additionally filled when the access is larger than 2^i bytes.
+  def mask = {
+    var res = 1.U
+    for (i <- 0 until log2Up(maxSize)) {
+      val upper = Mux(addr(i), res, 0.U) | Mux(size >= (i+1).U, ((BigInt(1) << (1 << i))-1).U, 0.U)
+      val lower = Mux(addr(i), 0.U, res)
+      res = Cat(upper, lower)
+    }
+    res
+  }
+
+  // For size == i the low 2^i bytes of `dat` are replicated across the container;
+  // sizes below the starting `i` fall through to `dat` unchanged.
+  protected def genData(i: Int): UInt =
+    if (i >= log2Up(maxSize)) dat
+    else Mux(size === i.U, Fill(1 << (log2Up(maxSize)-i), dat((8 << i)-1,0)), genData(i+1))
+
+  def data = genData(0)
+  def wordData = genData(2)
+}
+
+/** Load alignment helper: selects the addressed part of `dat` and extends it
+ *  (sign bit when `signed`, otherwise zeros) according to the access size.
+ */
+class LoadGen(typ: UInt, signed: Bool, addr: UInt, dat: UInt, zero: Bool, maxSize: Int) {
+  private val size = new StoreGen(typ, addr, dat, maxSize).size
+
+  // Walks from the largest half down to 2^logMinSize bytes, each step keeping the half selected by addr(i);
+  // when size == i the bits above that half are replaced by the extension.
+  private def genData(logMinSize: Int): UInt = {
+    var res = dat
+    for (i <- log2Up(maxSize)-1 to logMinSize by -1) {
+      val pos = 8 << i
+      val shifted = Mux(addr(i), res(2*pos-1,pos), res(pos-1,0))
+      // `zero` forces the result to zero, applied at the byte level (i == 0) only
+      val doZero = (i == 0).B && zero
+      val zeroed = Mux(doZero, 0.U, shifted)
+      res = Cat(Mux(size === i.U || doZero, Fill(8*maxSize-pos, signed && zeroed(pos-1)), res(8*maxSize-1,pos)), zeroed)
+    }
+    res
+  }
+
+  def wordData = genData(2)
+  def data = genData(0)
+}
+
+/** Combinational ALU for atomic memory operations.
+ *
+ *  Computes `lhs <op> rhs` for the AMO selected by `io.cmd` (add, and/or/xor, signed and
+ *  unsigned min/max; any other command selects `rhs`, which implements swap).
+ *  `io.out` merges the result into `lhs` under the byte mask; `io.out_unmasked` is the raw result.
+ *
+ *  @param operandBits operand width in bits; sub-word widths from 32 bits upwards are supported
+ */
+class AMOALU(operandBits: Int) extends Module
+  with MemoryOpConstants {
+  val minXLen = 32
+  // supported operand widths: 32, 64, ... up to operandBits
+  val widths = (0 to log2Ceil(operandBits / minXLen)).map(minXLen << _)
+
+  // Ports have to be wrapped in IO(...): this file uses `import chisel3._`, where a bare
+  // `new Bundle` assigned to `io` is rejected at elaboration time.
+  val io = IO(new Bundle {
+    val mask = Input(UInt((operandBits/8).W))
+    val cmd = Input(Bits(M_SZ.W))
+    val lhs = Input(Bits(operandBits.W))
+    val rhs = Input(Bits(operandBits.W))
+    val out = Output(Bits(operandBits.W))
+    val out_unmasked = Output(Bits(operandBits.W))
+  })
+
+  val max = io.cmd === M_XA_MAX || io.cmd === M_XA_MAXU
+  val min = io.cmd === M_XA_MIN || io.cmd === M_XA_MINU
+  val add = io.cmd === M_XA_ADD
+  // OR asserts both selectors: (a & b) | (a ^ b) == a | b
+  val logic_and = io.cmd === M_XA_OR || io.cmd === M_XA_AND
+  val logic_xor = io.cmd === M_XA_XOR || io.cmd === M_XA_OR
+
+  val adder_out = {
+    // partition the carry chain to support sub-xLen addition
+    val mask = ~(0.U(operandBits.W) +: widths.init.map(w => !io.mask(w/8-1) << (w-1))).reduce(_|_)
+    (io.lhs & mask) + (io.rhs & mask)
+  }
+
+  val less = {
+    // break up the comparator so the lower parts will be CSE'd
+    def isLessUnsigned(x: UInt, y: UInt, n: Int): Bool = {
+      if (n == minXLen) x(n-1, 0) < y(n-1, 0)
+      else x(n-1, n/2) < y(n-1, n/2) || x(n-1, n/2) === y(n-1, n/2) && isLessUnsigned(x, y, n/2)
+    }
+
+    def isLess(x: UInt, y: UInt, n: Int): Bool = {
+      // signedness is decided by the command bits in which M_XA_MIN and M_XA_MINU differ
+      val signed = {
+        val mask = M_XA_MIN ^ M_XA_MINU
+        (io.cmd & mask) === (M_XA_MIN & mask)
+      }
+      // equal top bits: plain unsigned compare; otherwise the top bits alone decide
+      Mux(x(n-1) === y(n-1), isLessUnsigned(x, y, n), Mux(signed, x(n-1), y(n-1)))
+    }
+
+    // pick the comparison for the widest width whose upper half is enabled in the byte mask
+    PriorityMux(widths.reverse.map(w => (io.mask(w/8/2), isLess(io.lhs, io.rhs, w))))
+  }
+
+  val minmax = Mux(Mux(less, min, max), io.lhs, io.rhs)
+  val logic =
+    Mux(logic_and, io.lhs & io.rhs, 0.U) |
+    Mux(logic_xor, io.lhs ^ io.rhs, 0.U)
+  val out =
+    Mux(add,                    adder_out,
+    Mux(logic_and || logic_xor, logic,
+                                minmax))
+
+  // expand the byte mask to a bit mask; unselected bytes keep the lhs value
+  val wmask = FillInterleaved(8, io.mask)
+  io.out := wmask & out | ~wmask & io.lhs
+  io.out_unmasked := out
+}
--- a/src/main/scala/xiangshan/mem/cache/L1Cache.scala
+++ b/src/main/scala/xiangshan/mem/cache/L1Cache.scala
+// See LICENSE.SiFive for license details.
+
+package xiangshan.mem.cache
+
+import chisel3._
+import chisel3.util._
+
+import xiangshan.HasXSParameter
+import xiangshan.mem.MemoryOpConstants
+
+// this file contains common building blocks that can be shared by ICache and DCache
+// this is the common parameter base for L1 ICache and L1 DCache
+trait L1CacheParameters {
+  def nSets:         Int  // number of sets
+  def nWays:         Int  // associativity
+  def rowBits:       Int  // width of one data-array row in bits
+  def nTLBEntries:   Int
+  def blockBytes:    Int  // cache block (line) size in bytes
+}
+
+/** Quantities derived from an [[L1CacheParameters]] instance; `cacheParams` is supplied by the concrete cache. */
+trait HasL1CacheParameters extends HasXSParameter
+  with MemoryOpConstants {
+  val cacheParams: L1CacheParameters
+
+  def nSets = cacheParams.nSets
+  // address bits that select a byte within a block
+  def blockOffBits = log2Up(cacheParams.blockBytes)
+  // address bits that select a set
+  def idxBits = log2Up(cacheParams.nSets)
+  def untagBits = blockOffBits + idxBits
+  // 4K page
+  def pgIdxBits = 12
+  // index + offset bits, capped at the page offset width
+  def pgUntagBits = untagBits min pgIdxBits
+
+  // L1 cache are all physically tagged cache
+  def tagBits = PAddrBits - pgUntagBits
+  def nWays = cacheParams.nWays
+  def wayBits = log2Up(nWays)
+  def rowBits = cacheParams.rowBits
+  def rowBytes = rowBits/8
+  def rowOffBits = log2Up(rowBytes)
+  def nTLBEntries = cacheParams.nTLBEntries
+
+  // width of the L1 bus data field, and the number of bus beats needed to move one block
+  def cacheDataBits = l1BusDataWidth
+  def cacheDataBytes = cacheDataBits / 8
+  def cacheDataBeats = (cacheParams.blockBytes * 8) / cacheDataBits
+  def refillCycles = cacheDataBeats
+}
+
+// Base class for modules that need the derived L1 cache parameters.
+abstract class L1CacheModule extends Module
+  with HasL1CacheParameters
+
+// Base class for bundles that need the derived L1 cache parameters.
+abstract class L1CacheBundle extends Bundle
+  with HasL1CacheParameters
--- a/src/main/scala/xiangshan/mem/cache/dcache.scala
+++ b/src/main/scala/xiangshan/mem/cache/dcache.scala
--- a/src/main/scala/xiangshan/mem/cache/icache.scala
+++ b/src/main/scala/xiangshan/mem/cache/icache.scala
+package xiangshan.mem.cache
+
+import chisel3._
+import chisel3.util._
+import xiangshan._
+import xiangshan.utils._
+import chisel3.util.experimental.BoringUtils
+import xiangshan.backend.decode.XSTrap
+import xiangshan.mem._
+
+import bus.tilelink.TLParameters
+import bus.tilelink.TLPermissions
+import bus.tilelink.ClientMetadata
+import _root_.utils.{Code, RandomReplacement}
+
+// ICache specific parameters
+// With the defaults below the L1 ICache has 64 sets, 8 ways and 64-byte blocks, a total of 32KB
+// It's a virtually indexed, physically tagged cache.
+case class ICacheParameters(
+    nSets: Int = 64,
+    nWays: Int = 8,
+    rowBits: Int = 64,
+    nTLBEntries: Int = 32,
+    tagECC: Option[String] = None,
+    dataECC: Option[String] = None,
+    dataECCBytes: Int = 1,
+    nMSHRs: Int = 1,
+    nSDQ: Int = 17,
+    nRPQ: Int = 16,
+    nMMIOs: Int = 1,
+    blockBytes: Int = 64) extends L1CacheParameters {
+
+  // ECC schemes selected by name; None falls back to the identity code (see Code.fromString)
+  def tagCode: Code = Code.fromString(tagECC)
+  def dataCode: Code = Code.fromString(dataECC)
+
+  // a new RandomReplacement instance is created on every call
+  def replacement = new RandomReplacement(nWays)
+}
+
+/** Quantities derived from the instruction-cache configuration. */
+trait HasICacheParameters extends HasL1CacheParameters {
+  // The ICache must be parameterised by icacheParameters; the previous binding to
+  // dcacheParameters made every ICache module and bundle silently use the DCache configuration.
+  val cacheParams = icacheParameters
+  val cfg = cacheParams
+
+  // the width of inner CPU data interface
+  def wordBits = DataBits
+  def wordBytes = DataBytes
+  def wordOffBits = log2Up(wordBytes)
+  // bytes moved per bus beat, and how many CPU words fit in one beat
+  def beatBytes = cfg.blockBytes / cacheDataBeats
+  def beatWords = beatBytes / wordBytes
+  def beatOffBits = log2Up(beatBytes)
+  // bit positions of the set index and of the word offset within a block
+  def idxMSB = untagBits-1
+  def idxLSB = blockOffBits
+  def offsetmsb = idxLSB-1
+  def offsetlsb = wordOffBits
+  def rowWords = rowBits/wordBits
+  def doNarrowRead = DataBits * nWays % rowBits == 0
+  // ECC granularity and the resulting encoded widths
+  def eccBytes = cacheParams.dataECCBytes
+  val eccBits = cacheParams.dataECCBytes * 8
+  val encBits = cacheParams.dataCode.width(eccBits)
+  val encWordBits = encBits * (wordBits / eccBits)
+  def encDataBits = cacheParams.dataCode.width(wordBits) // NBDCache only
+  def encRowBits = encDataBits*rowWords
+
+  require(isPow2(nSets), s"nSets($nSets) must be pow2")
+  // To make things easier, now we assume:
+  // core_data_width(wordBits) == L1_basic_storage_unit_width(rowBits) ==
+  // outer_tilelink_interface_width(cacheDataBits)
+  require(rowBits == wordBits, s"rowBits($rowBits) != wordBits($wordBits)")
+  require(rowBits == cacheDataBits, s"rowBits($rowBits) != cacheDataBits($cacheDataBits)")
+}
+
+// Base class for ICache modules: a Module with the derived ICache parameters in scope.
+abstract class ICacheModule extends Module
+  with HasICacheParameters
+
+// Base class for ICache bundles: a Bundle with the derived ICache parameters in scope.
+abstract class ICacheBundle extends Bundle
+  with HasICacheParameters
+
+/** Metadata read request: one L1MetaReadReq per lane (`memWidth` lanes). */
+class ICacheMetaReadReq extends ICacheBundle {
+  val req = Vec(memWidth, new L1MetaReadReq)
+}
+
+/** Data read request: one L1DataReadReq per lane (`memWidth` lanes), each with its own valid bit. */
+class ICacheDataReadReq extends ICacheBundle {
+  val req = Vec(memWidth, new L1DataReadReq)
+  val valid = Vec(memWidth, Bool())
+}
--- a/src/main/scala/xiangshan/mem/cache/mshrs.scala
+++ b/src/main/scala/xiangshan/mem/cache/mshrs.scala
--- a/src/main/scala/xiangshan/mem/cache/wbu.scala
+++ b/src/main/scala/xiangshan/mem/cache/wbu.scala
+package xiangshan.mem.cache
+
+import chisel3._
+import chisel3.util._
+import chisel3.util.experimental.BoringUtils
+
+import bus.tilelink._
+
+/** Request to write one cache line back; the line is identified by `tag`, `idx` and `way_en`. */
+class WritebackReq extends DCacheBundle {
+  val tag = Bits(tagBits.W)
+  val idx = Bits(idxBits.W)
+  // TODO: make it configurable
+  // Open question: is this source simply the MSHR id? If so, how is a request that answers a probe handled?
+  val source = UInt(cfg.busParams.sourceBits.W)
+  val param = UInt(TLPermissions.cWidth.W) 
+  val way_en = Bits(nWays.W)
+  // Presumably anything that comes down from the WBU itself is voluntary?
+  val voluntary = Bool()
+}
+
+/** Writeback unit: copies one cache line from the data array into a local buffer and then
+ *  sends it on TileLink channel C, as a voluntary Release or as a ProbeAck.
+ *
+ *  State machine:
+ *   - s_invalid:     idle; accepts one request
+ *   - s_fill_buffer: issues `refillCycles` row reads and captures the returned rows
+ *   - s_active:      sends the buffered rows on `io.release`, one per beat
+ *   - s_grant:       (voluntary requests only) waits until a grant has been seen
+ */
+class WritebackUnit extends DCacheModule {
+  // Ports have to be wrapped in IO(...): this file uses `import chisel3._`, where a bare
+  // `new Bundle` assigned to `io` is rejected at elaboration time.
+  val io = IO(new Bundle {
+    val req = Flipped(Decoupled(new WritebackReq()))
+    // asserted in the cycle in which the last row is captured into the buffer
+    val resp = Output(Bool())
+    // read port towards the data array; for simplicity everything unrelated has been left out
+    val data_req = Decoupled(new L1DataReadReq)
+    val data_resp = Input(UInt(encRowBits.W))
+    val release = Decoupled(new TLBundleC(cfg.busParams))
+    // NOTE(review): this port is only ever read (fire()), yet it is declared as an output
+    // Decoupled whose valid/bits are never driven here. It most likely has to become
+    // Flipped(...) or a plain Input(Bool()); left unchanged because the parent wiring is not visible.
+    val mem_grant = Decoupled(new TLBundleD(cfg.busParams))
+  })
+
+  // only one request is handled at a time
+  val req = Reg(new WritebackReq())
+  val s_invalid :: s_fill_buffer :: s_active :: s_grant :: Nil = Enum(4)
+  val state = RegInit(s_invalid)
+  // r1_*/r2_* delay the "read fired" flag and its beat index by one and two cycles:
+  // the row is sampled from data_resp two cycles after the read request fired.
+  // Open question from the author: why two cycles? Possibly because of bank-conflict logic.
+  val r1_data_req_fired = RegInit(false.B)
+  val r2_data_req_fired = RegInit(false.B)
+  val r1_data_req_cnt = Reg(UInt(log2Up(refillCycles+1).W))
+  val r2_data_req_cnt = Reg(UInt(log2Up(refillCycles+1).W))
+  // beat counter, used first for the reads and then for the release beats
+  val data_req_cnt = RegInit(0.U(log2Up(refillCycles+1).W))
+  val (_, last_beat, all_beats_done, beat_count) = TLUtilities.count(io.release)
+  // The line is buffered instead of being fed to the bus directly: data_resp has no
+  // ready/valid handshake, and re-issuing reads whenever the bus is not ready would be more complex.
+  val wb_buffer = Reg(Vec(refillCycles, UInt(encRowBits.W)))
+  val acked = RegInit(false.B)
+
+  // defaults; overridden in the state-specific blocks below
+  io.release.valid   := false.B
+  io.release.bits    := DontCare
+  io.req.ready       := false.B
+  io.data_req.valid  := false.B
+  io.data_req.bits   := DontCare
+  io.resp            := false.B
+
+  // block-aligned address of the line being written back
+  val r_address = Cat(req.tag, req.idx) << blockOffBits
+  // source id used by this unit on the bus
+  val id = cfg.nMSHRs
+  // message used when the writeback answers a probe (req.voluntary is false)
+  val probeResponse = TLMasterUtilities.ProbeAck(
+                          params = cfg.busParams,
+                          fromSource = id.U,
+                          toAddress = r_address,
+                          lgSize = log2Ceil(cfg.blockBytes).U,
+                          reportPermissions = req.param,
+                          data = wb_buffer(data_req_cnt))
+
+  // message used for a voluntary release; only the second element of the returned pair is needed
+  val voluntaryRelease = TLMasterUtilities.Release(
+                          params = cfg.busParams,
+                          fromSource = id.U,
+                          toAddress = r_address,
+                          lgSize = log2Ceil(cfg.blockBytes).U,
+                          shrinkPermissions = req.param,
+                          data = wb_buffer(data_req_cnt))._2
+
+
+  when (state === s_invalid) {
+    io.req.ready := true.B
+    when (io.req.fire()) {
+      state := s_fill_buffer
+      data_req_cnt := 0.U
+      req := io.req.bits
+      acked := false.B
+    }
+  }
+
+  // Timing of the fill phase:
+  //   data_req_cnt:    index of the read request being issued
+  //   r1_data_req_cnt: valid one cycle after the request fired
+  //   r2_data_req_cnt: valid two cycles after the request fired, which is when the row is captured
+  when (state === s_fill_buffer) {
+    io.data_req.valid := data_req_cnt < refillCycles.U
+    io.data_req.bits.way_en := req.way_en
+    // row address: set index, followed by the beat number when a line spans several rows
+    io.data_req.bits.addr := (if(refillCycles > 1)
+                              Cat(req.idx, data_req_cnt(log2Up(refillCycles)-1,0))
+                            else req.idx) << rowOffBits
+
+    r1_data_req_fired := false.B
+    r1_data_req_cnt   := 0.U
+    r2_data_req_fired := r1_data_req_fired
+    r2_data_req_cnt   := r1_data_req_cnt
+    when (io.data_req.fire()) {
+      // the fired flag enters the delay pipeline
+      r1_data_req_fired := true.B
+      r1_data_req_cnt   := data_req_cnt
+      data_req_cnt := data_req_cnt + 1.U
+    }
+    when (r2_data_req_fired) {
+      wb_buffer(r2_data_req_cnt) := io.data_resp
+      when (r2_data_req_cnt === (refillCycles-1).U) {
+        // the whole line is buffered: signal resp and start sending
+        io.resp := true.B
+        state := s_active
+        data_req_cnt := 0.U
+      }
+    }
+  } .elsewhen (state === s_active) {
+    io.release.valid := data_req_cnt < refillCycles.U
+    io.release.bits := Mux(req.voluntary, voluntaryRelease, probeResponse)
+
+    // a grant seen while beats are still being sent is remembered for s_grant
+    when (io.mem_grant.fire()) {
+      acked := true.B
+    }
+    when (io.release.fire()) {
+      data_req_cnt := data_req_cnt + 1.U
+    }
+    when ((data_req_cnt === (refillCycles-1).U) && io.release.fire()) {
+      // a voluntary release waits for the grant; a probe response finishes immediately
+      state := Mux(req.voluntary, s_grant, s_invalid)
+    }
+  } .elsewhen (state === s_grant) {
+    when (io.mem_grant.fire()) {
+      acked := true.B
+    }
+    when (acked) {
+      state := s_invalid
+    }
+  }
+}
--- a/src/main/scala/xiangshan/mem/pipeline/Lsu.scala
+++ b/src/main/scala/xiangshan/mem/pipeline/Lsu.scala
@@ -91,7 +91,7 @@ class LsuIO extends XSBundle with HasMEMConst {
  val stin = Vec(2, Flipped(Decoupled(new StuReq)))
  val out = Vec(2, Decoupled(new ExuOutput))
  val redirect = Flipped(ValidIO(new Redirect))
-  val dcache = Flipped(new DcacheIO)
+  val dcache = Flipped(new LSUDMemIO)
  val dtlb = Flipped(new DtlbIO)
  // lsroq
  // sbuffer
@@ -381,4 +381,4 @@ class Lsu(implicit val p: XSConfig) extends XSModule with HasMEMConst with NeedI

 // update store buffer according to store fill buffer

-}
\ No newline at end of file
+}
--- a/src/test/scala/xiangshan/backend/exu/DCacheTest.scala
+++ b/src/test/scala/xiangshan/backend/exu/DCacheTest.scala
+package xiangshan.backend.exu
+
+import org.scalatest._
+import scala.collection.mutable.{Map, Queue}
+
+import chisel3._
+import chisel3.experimental.BundleLiterals._
+import chiseltest._
+
+import xiangshan.XSModule
+import xiangshan.mem.{LSUDMemIO, MemoryOpConstants}
+import xiangshan.mem.cache.DCache
+import bus.tilelink.NaiveTLToAXI4
+import device.AXI4RAM
+
+/**
+ * Device-under-test wrapper used by [[DCacheTest]].
+ *
+ * It instantiates a `DCache`, a `NaiveTLToAXI4` bridge and an `AXI4RAM`, and chains them:
+ * the cache's `bus` port is connected to the bridge input and the bridge output is
+ * connected to the RAM. The only external port is `io.in`, which is wired to the
+ * cache's `lsu` port.
+ */
+class DCacheDut extends XSModule {
+  val io = IO(new Bundle() {
+    // LSU-side request/response interface, forwarded unchanged to the cache
+    val in = Flipped(new LSUDMemIO)
+  })
+
+  val dcache = Module(new DCache)
+  // backing memory: memByte is set to 128 * 1024 * 1024, with useBlackBox enabled
+  val mem = Module(new AXI4RAM(memByte = 128 * 1024 * 1024, useBlackBox = true))
+  // l1BusParams is not defined in this class; presumably inherited from XSModule - TODO confirm
+  val tlToAXI = Module(new NaiveTLToAXI4(l1BusParams))
+
+  // LSU <-> cache <-> TileLink-to-AXI4 bridge <-> RAM
+  dcache.io.lsu <> io.in
+  dcache.io.bus <> tlToAXI.io.in
+  tlToAXI.io.out <> mem.in
+}
+
+
+/**
+ * Software-side description of one memory request issued by the test.
+ *
+ * Each field is poked onto the identically named field of a request channel.
+ *
+ * @param cmd  memory command, already a hardware literal (the test uses M_XRD / M_XWR)
+ * @param addr address of the access
+ * @param data data to write (the test passes 0 for reads)
+ * @param mask mask poked alongside the data (the test uses 0xff)
+ * @param meta value the test uses as the request id: responses are matched back to
+ *             their request through the `meta` they carry
+ */
+case class Req(
+  cmd: UInt,
+  addr: Long,
+  data: Long,
+  mask: Long,
+  meta: Long
+)
+
+/**
+ * Software-side description of one response (data plus the `meta` tag).
+ *
+ * NOTE(review): not referenced anywhere else in the visible test code.
+ *
+ * @param data response data
+ * @param meta meta tag of the response
+ */
+case class Resp(
+  data: Long,
+  meta: Long
+)
+
+/**
+ * Load/store test for the DCache, driven through [[DCacheDut]].
+ *
+ * The test keeps a software "golden" copy of memory. It first writes a random 64-bit
+ * value to every 8-byte cell, then reads every cell back and checks the returned data
+ * against the golden copy. Requests that come back with `nack` set are re-queued and
+ * replayed until they retire.
+ */
+class DCacheTest extends FlatSpec with ChiselScalatestTester with Matchers {
+  behavior of "DCache"
+
+  it should "do load store correctly" in {
+    test(new DCacheDut) { c =>
+      val CMD_READ = MemoryOpConstants.M_XRD
+      val CMD_WRITE = MemoryOpConstants.M_XWR
+      val FULL_MASK = 0xff
+
+      val BASE_ADDR = 0x80000000L
+      val MEM_SIZE = 128 * 1024 * 1024
+
+      // for now, we only support load/store of 64bit integers
+      val INTEGER_SIZE = 8
+      val num_integers = MEM_SIZE / INTEGER_SIZE
+
+      // number of request/response channels this test drives and observes
+      val NUM_CHANNELS = 2
+
+      // all-ones mask used to reinterpret a signed Long as an unsigned 64-bit value
+      val UNSIGNED_MASK = (BigInt(1) << 64) - 1
+
+      // data structures
+      // our golden version cache, indexed by cell number (not by bus address)
+      val mem = new Array[Long](num_integers)
+      var num_retired_reqs = 0
+
+      // at each clock, we try to issue the request bundle at the head
+      val issue_queue = Queue[Array[Req]]()
+      // map that store all requests, map req id to req
+      // whenever you want to replay a req, you can get the req with its id
+      val all_requests: Map[Long, Req] = Map()
+
+      // is the bundle at the head of the queue already driven and waiting for req.ready?
+      var req_waiting: Boolean = false
+
+      // Unsigned value with the same 64-bit pattern as x.
+      // UInt literals cannot be built from negative numbers, and nextLong is often negative.
+      def to_unsigned(x: Long): BigInt = BigInt(x) & UNSIGNED_MASK
+
+      // Golden-array index of the cell that lives at bus address addr.
+      // The raw address does not fit in an Int (it starts at 0x80000000), so rebase first.
+      def mem_index(addr: Long): Int = ((addr - BASE_ADDR) / INTEGER_SIZE).toInt
+
+      // reset all bookkeeping before a test phase and stop driving requests
+      def init_test(): Unit = {
+        req_waiting = false
+        num_retired_reqs = 0
+        issue_queue.clear()
+        all_requests.clear()
+        c.io.in.req.valid.poke(false.B)
+      }
+
+      // drive one req onto a specific channel
+      def send_req_channel(req: Req, channel: Int): Unit = {
+        val port = c.io.in.req.bits(channel)
+        port.bits.cmd.poke(req.cmd)
+        port.bits.addr.poke(req.addr.U)
+        port.bits.data.poke(to_unsigned(req.data).U)
+        port.bits.mask.poke(req.mask.U)
+        port.bits.meta.poke(req.meta.U)
+        port.valid.poke(true.B)
+      }
+
+      // send a bundle of reqs in the same cycle
+      def send_req_bundle(reqs: Array[Req]): Unit = {
+        for (i <- reqs.indices) {
+          send_req_channel(reqs(i), i)
+        }
+        // channels not used by this bundle must not keep a stale valid from an earlier one
+        for (i <- reqs.length until NUM_CHANNELS) {
+          c.io.in.req.bits(i).valid.poke(false.B)
+        }
+        c.io.in.req.valid.poke(true.B)
+      }
+
+      // try to issue the bundle at the head of the queue during this clock
+      def send_req(): Unit = {
+        if (issue_queue.isEmpty) {
+          // no more requests to issue: deassert valid so the last bundle is not fired again
+          c.io.in.req.valid.poke(false.B)
+        } else {
+          // there are no requests waiting for handshake
+          // we may send a new request during this clock
+          if (!req_waiting) {
+            req_waiting = true
+            send_req_bundle(issue_queue.front)
+          }
+
+          // reqs can be fired
+          if (c.io.in.req.ready.peek().litToBoolean) {
+            req_waiting = false
+            issue_queue.dequeue()
+          }
+        }
+      }
+
+      // look at every response channel; replay nacked requests, check and retire the others
+      def handle_resp(): Unit = {
+        for (i <- 0 until NUM_CHANNELS) {
+          val resp = c.io.in.resp(i)
+          if (resp.valid.peek().litToBoolean) {
+            val id = resp.bits.meta.peek().litValue.toLong
+            val original_req =
+              all_requests.getOrElse(id, fail(s"response on channel $i carries unknown meta $id"))
+            // needs to be replayed
+            if (resp.bits.nack.peek().litToBoolean) {
+              issue_queue.enqueue(Array[Req](original_req))
+            } else {
+              num_retired_reqs += 1
+              if (original_req.cmd.litValue == CMD_READ.litValue) {
+                resp.bits.data.expect(to_unsigned(mem(mem_index(original_req.addr))).U)
+              }
+            }
+          }
+        }
+      }
+
+      // clock the DUT until every request of the current phase has retired
+      def run_until_all_retired(): Unit = {
+        while (num_retired_reqs < num_integers) {
+          send_req()
+          handle_resp()
+          c.clock.step()
+        }
+      }
+
+      val r = scala.util.Random
+
+      // ----------------------------------------
+      // store test
+      init_test()
+
+      // first, initialize every memory cell with random numbers
+      for (i <- 0 until num_integers) {
+        val randomNumber = r.nextLong
+        val req = Req(CMD_WRITE, BASE_ADDR + i * INTEGER_SIZE, randomNumber, FULL_MASK, i)
+        issue_queue.enqueue(Array[Req](req))
+        all_requests += (i.toLong -> req)
+        mem(i) = randomNumber
+      }
+
+      run_until_all_retired()
+
+      // ----------------------------------------
+      // read test
+      init_test()
+
+      // read out every integer; handle_resp checks the data against the golden copy
+      for (i <- 0 until num_integers) {
+        val req = Req(CMD_READ, BASE_ADDR + i * INTEGER_SIZE, 0, FULL_MASK, i)
+        issue_queue.enqueue(Array[Req](req))
+        all_requests += (i.toLong -> req)
+      }
+
+      run_until_all_retired()
+    }
+  }
+}