Commit 743bc277 authored by: A Allen

L1DCache: a complete rewrite.

Now, it can compile.
Parent 98c3bf7a
......@@ -58,6 +58,8 @@ object OneHot {
def OH1ToUInt(x: UInt): UInt = OHToUInt(OH1ToOH(x))
def UIntToOH1(x: UInt, width: Int): UInt = ~((-1).S(width.W).asUInt << x)(width-1, 0)
def UIntToOH1(x: UInt): UInt = UIntToOH1(x, (1 << x.getWidth) - 1)
def checkOneHot(in: Bits): Unit = assert(PopCount(in) <= 1.U)
def checkOneHot(in: Iterable[Bool]): Unit = assert(PopCount(in) <= 1.U)
}
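// Illustrative usage (a sketch, not part of this patch): checkOneHot raises an
// assertion whenever more than one of its inputs is asserted, e.g.
//   OneHot.checkOneHot(Seq(io.primary_ready, io.secondary_ready)) // at most one may be high
//   OneHot.checkOneHot(someOneHotVec.asUInt)                      // UInt overload
// Note that an all-zero input also passes, since PopCount(in) <= 1.U holds then;
// the names io.primary_ready/io.secondary_ready above are just examples.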
object LowerMask {
......
......@@ -191,8 +191,9 @@ trait HasXSParameter {
tagECC = Some("secded"),
dataECC = Some("secded"),
nMissEntries = 16,
nLoadMissEntries = 8,
nStoreMissEntries = 8
nProbeEntries = 16,
nReleaseEntries = 16,
nStoreReplayEntries = 16
)
val LRSCCycles = 100
......
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.XSDebug
class AtomicsReplayEntry extends DCacheModule
{
val io = IO(new Bundle {
val lsu = Flipped(new DCacheWordIO)
val pipe_req = Decoupled(new MainPipeReq)
val pipe_resp = Flipped(ValidIO(new MainPipeResp))
val block_addr = Output(Valid(UInt()))
})
val s_invalid :: s_pipe_req :: s_pipe_resp :: s_resp :: Nil = Enum(4)
val state = RegInit(s_invalid)
// atomics are word-granularity requests, so buffer a DCacheWordReq
val req = Reg(new DCacheWordReq)
// assign default values to output signals
io.lsu.req.ready := state === s_invalid
io.lsu.resp.valid := false.B
io.lsu.resp.bits := DontCare
io.pipe_req.valid := false.B
io.pipe_req.bits := DontCare
io.block_addr.valid := state =/= s_invalid
io.block_addr.bits := req.addr
when (state =/= s_invalid) {
XSDebug("AtomicsReplayEntry: state: %d block_addr: %x\n", state, io.block_addr.bits)
}
// --------------------------------------------
// s_invalid: receive requests
when (state === s_invalid) {
when (io.lsu.req.fire()) {
req := io.lsu.req.bits
state := s_pipe_req
}
}
// --------------------------------------------
// replay
when (state === s_pipe_req) {
io.pipe_req.valid := true.B
val pipe_req = io.pipe_req.bits
pipe_req := DontCare
pipe_req.miss := false.B
pipe_req.probe := false.B
pipe_req.source := AMO_SOURCE.U
pipe_req.cmd := req.cmd
pipe_req.addr := get_block_addr(req.addr)
pipe_req.word_idx := get_word(req.addr)
pipe_req.amo_data := req.data
pipe_req.amo_mask := req.mask
when (io.pipe_req.fire()) {
state := s_pipe_resp
}
}
val resp_data = Reg(UInt())
when (state === s_pipe_resp) {
// when not miss:
// everything is OK, simply send the response back to the atomics unit
// when miss and not replay:
// wait for missQueue to handle the miss and replay our request
// when miss and replay:
// req missed and failed to enter missQueue, manually replay it later
// TODO: add assertions:
// 1. add a replay delay counter?
// 2. when req gets into MissQueue, it should not miss any more
when (io.pipe_resp.fire()) {
when (io.pipe_resp.bits.miss) {
when (io.pipe_resp.bits.replay) {
state := s_pipe_req
}
} .otherwise {
resp_data := io.pipe_resp.bits.data
state := s_resp
}
}
}
// --------------------------------------------
when (state === s_resp) {
io.lsu.resp.valid := true.B
io.lsu.resp.bits := DontCare
io.lsu.resp.bits.data := resp_data
io.lsu.resp.bits.id := req.id
when (io.lsu.resp.fire()) {
state := s_invalid
}
}
// debug output
when (io.lsu.req.fire()) {
io.lsu.req.bits.dump()
}
when (io.lsu.resp.fire()) {
io.lsu.resp.bits.dump()
}
when (io.pipe_req.fire()) {
io.pipe_req.bits.dump()
}
when (io.pipe_resp.fire()) {
io.pipe_resp.bits.dump()
}
}
......@@ -20,12 +20,10 @@ case class DCacheParameters
tagECC: Option[String] = None,
dataECC: Option[String] = None,
nMissEntries: Int = 1,
nLoadMissEntries: Int = 1,
nStoreMissEntries: Int = 1,
nMiscMissEntries: Int = 1,
nProbeEntries: Int = 1,
nReleaseEntries: Int = 1,
nStoreReplayEntries: Int = 1,
nMMIOEntries: Int = 1,
nSDQ: Int = 17,
nRPQ: Int = 16,
nMMIOs: Int = 1,
blockBytes: Int = 64
) extends L1CacheParameters {
......@@ -48,23 +46,12 @@ trait HasDCacheParameters extends HasL1CacheParameters {
def nIOMSHRs = cacheParams.nMMIOs
def maxUncachedInFlight = cacheParams.nMMIOs
def missQueueEntryIdWidth = log2Up(cfg.nMissEntries)
def loadMissQueueEntryIdWidth = log2Up(cfg.nLoadMissEntries)
def storeMissQueueEntryIdWidth = log2Up(cfg.nStoreMissEntries)
def miscMissQueueEntryIdWidth = log2Up(cfg.nMiscMissEntries)
def clientMissQueueEntryIdWidth = max(
max(loadMissQueueEntryIdWidth,
storeMissQueueEntryIdWidth),
miscMissQueueEntryIdWidth)
// clients: ldu 0, ldu1, stu, atomics
def nClientMissQueues = 4
def clientIdWidth = log2Up(nClientMissQueues)
def missQueueClientIdWidth = clientIdWidth + clientMissQueueEntryIdWidth
def clientIdMSB = missQueueClientIdWidth - 1
def clientIdLSB = clientMissQueueEntryIdWidth
def entryIdMSB = clientMissQueueEntryIdWidth - 1
def entryIdLSB = 0
def nSourceType = 3
def sourceTypeWidth = log2Up(nSourceType)
def LOAD_SOURCE = 0
def STORE_SOURCE = 1
def AMO_SOURCE = 2
// each source uses an id to distinguish its multiple reqs
def reqIdWidth = 64
require(isPow2(nSets), s"nSets($nSets) must be pow2")
......@@ -73,6 +60,7 @@ trait HasDCacheParameters extends HasL1CacheParameters {
require(full_divide(beatBits, rowBits), s"beatBits($beatBits) must be multiple of rowBits($rowBits)")
// this is a VIPT L1 cache
require(pgIdxBits >= untagBits, s"page aliasing problem: pgIdxBits($pgIdxBits) < untagBits($untagBits)")
require(rowWords == 1, "Our DCache Implementation assumes rowWords == 1")
}
abstract class DCacheModule extends L1CacheModule
......
package xiangshan.cache
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp, TransferSizes}
import freechips.rocketchip.tilelink.{TLClientNode, TLClientParameters,
TLMasterParameters, TLMasterPortParameters, TLArbiter, TLMessages}
// memory request in word granularity (load, mmio, lr/sc, atomics)
class DCacheWordReq extends DCacheBundle
{
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val data = UInt(DataBits.W)
val mask = UInt((DataBits/8).W)
val id = UInt(reqIdWidth.W)
def dump() = {
XSDebug("DCacheWordReq: cmd: %x addr: %x data: %x mask: %x id: %d\n",
cmd, addr, data, mask, id)
}
}
// memory request in cache line granularity (store)
class DCacheLineReq extends DCacheBundle
{
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val data = UInt((cfg.blockBytes * 8).W)
val mask = UInt(cfg.blockBytes.W)
val id = UInt(reqIdWidth.W)
def dump() = {
XSDebug("DCacheLineReq: cmd: %x addr: %x data: %x mask: %x id: %d\n",
cmd, addr, data, mask, id)
}
}
class DCacheWordResp extends DCacheBundle
{
val data = UInt(DataBits.W)
// cache req missed, send it to miss queue
val miss = Bool()
// cache req nacked, replay it later
val replay = Bool()
val id = UInt(reqIdWidth.W)
def dump() = {
XSDebug("DCacheWordResp: data: %x id: %d miss: %b replay: %b\n",
data, id, miss, replay)
}
}
class DCacheLineResp extends DCacheBundle
{
val data = UInt((cfg.blockBytes * 8).W)
// cache req missed, send it to miss queue
val miss = Bool()
// cache req nacked, replay it later
val replay = Bool()
val id = UInt(reqIdWidth.W)
def dump() = {
XSDebug("DCacheLineResp: data: %x id: %d miss: %b replay: %b\n",
data, id, miss, replay)
}
}
class Refill extends DCacheBundle
{
val addr = UInt(PAddrBits.W)
val data = UInt((cfg.blockBytes * 8).W)
def dump() = {
XSDebug("Refill: addr: %x data: %x\n", addr, data)
}
}
class DCacheWordIO extends DCacheBundle
{
val req = DecoupledIO(new DCacheWordReq)
val resp = Flipped(DecoupledIO(new DCacheWordResp))
}
// used by load unit
class DCacheLoadIO extends DCacheWordIO
{
// kill previous cycle's req
val s1_kill = Output(Bool())
// cycle 0: virtual address: req.addr
// cycle 1: physical address: s1_paddr
val s1_paddr = Output(UInt(PAddrBits.W))
}
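// Illustrative driver, assuming a hypothetical load unit (s0_vaddr, s1_paddr,
// s1_tlb_miss are placeholder names, not part of this patch): since this is a
// VIPT cache, the req carries the virtual address in cycle 0 for indexing, the
// translated physical address arrives on s1_paddr in cycle 1 for tag compare,
// and s1_kill cancels the previous cycle's req, e.g. on a TLB miss:
//   io.dcache.req.valid := s0_valid
//   io.dcache.req.bits.addr := s0_vaddr // cycle 0: index with vaddr
//   io.dcache.s1_paddr := s1_paddr // cycle 1: compare tags with paddr
//   io.dcache.s1_kill := s1_tlb_miss // kill previous cycle's req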
class DCacheLineIO extends DCacheBundle
{
val req = DecoupledIO(new DCacheLineReq )
val resp = Flipped(DecoupledIO(new DCacheLineResp))
}
class DCacheToLsuIO extends DCacheBundle {
val load = Vec(LoadPipelineWidth, Flipped(new DCacheLoadIO)) // for speculative load
val lsq = ValidIO(new Refill) // refill to load queue, wake up load misses
val store = Flipped(new DCacheLineIO) // for sbuffer
val atomics = Flipped(new DCacheWordIO) // atomics reqs
}
class DCacheIO extends DCacheBundle {
val lsu = new DCacheToLsuIO
val prefetch = DecoupledIO(new MissReq)
}
class DCache()(implicit p: Parameters) extends LazyModule with HasDCacheParameters {
val clientParameters = TLMasterPortParameters.v1(
Seq(TLMasterParameters.v1(
name = "dcache",
sourceId = IdRange(0, cfg.nMissEntries+1),
supportsProbe = TransferSizes(cfg.blockBytes)
))
)
val clientNode = TLClientNode(Seq(clientParameters))
lazy val module = new DCacheImp(this)
}
class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParameters with HasXSLog {
val io = IO(new DCacheIO)
val (bus, edge) = outer.clientNode.out.head
require(bus.d.bits.data.getWidth == l1BusDataWidth, "DCache: tilelink width does not match")
//----------------------------------------
// core data structures
val dataArray = Module(new DuplicatedDataArray)
val metaArray = Module(new DuplicatedMetaArray)
/*
dataArray.dump()
metaArray.dump()
*/
//----------------------------------------
// core modules
val ldu = Seq.fill(LoadPipelineWidth) { Module(new LoadPipe) }
val storeReplayUnit = Module(new StoreReplayQueue)
val atomicsReplayUnit = Module(new AtomicsReplayEntry)
val mainPipe = Module(new MainPipe)
val missQueue = Module(new MissQueue(edge))
val probeQueue = Module(new ProbeQueue(edge))
val wb = Module(new WritebackUnit(edge))
//----------------------------------------
// meta array
val MetaWritePortCount = 1
val MainPipeMetaWritePort = 0
metaArray.io.write <> mainPipe.io.meta_write
// MainPipe contends with Load 0 for MetaRead;
// give priority to Load
val MetaReadPortCount = 2
val LoadPipeMetaReadPort = 0
val MainPipeMetaReadPort = 1
val metaReadArb = Module(new Arbiter(new L1MetaReadReq, MetaReadPortCount))
metaReadArb.io.in(LoadPipeMetaReadPort) <> ldu(0).io.meta_read
metaReadArb.io.in(MainPipeMetaReadPort) <> mainPipe.io.meta_read
metaArray.io.read(0) <> metaReadArb.io.out
ldu(0).io.meta_resp <> metaArray.io.resp(0)
mainPipe.io.meta_resp <> metaArray.io.resp(0)
for (w <- 1 until LoadPipelineWidth) {
metaArray.io.read(w) <> ldu(w).io.meta_read
ldu(w).io.meta_resp <> metaArray.io.resp(w)
}
//----------------------------------------
// data array
val DataWritePortCount = 1
val MainPipeDataWritePort = 0
dataArray.io.write <> mainPipe.io.data_write
// give priority to load
val DataReadPortCount = 2
val LoadPipeDataReadPort = 0
val MainPipeDataReadPort = 1
val dataReadArb = Module(new Arbiter(new L1DataReadReq, DataReadPortCount))
dataReadArb.io.in(LoadPipeDataReadPort) <> ldu(0).io.data_read
dataReadArb.io.in(MainPipeDataReadPort) <> mainPipe.io.data_read
dataArray.io.read(0) <> dataReadArb.io.out
dataArray.io.resp(0) <> ldu(0).io.data_resp
dataArray.io.resp(0) <> mainPipe.io.data_resp
for (w <- 1 until LoadPipelineWidth) {
dataArray.io.read(w) <> ldu(w).io.data_read
dataArray.io.resp(w) <> ldu(w).io.data_resp
}
//----------------------------------------
// load pipe
// the s1 kill signal
// only lsu uses this, replay never kills
for (w <- 0 until LoadPipelineWidth) {
ldu(w).io.lsu <> io.lsu.load(w)
// replay and nack not needed anymore
// TODO: remove replay and nack
ldu(w).io.nack := false.B
}
//----------------------------------------
// store pipe and store miss queue
storeReplayUnit.io.lsu <> io.lsu.store
//----------------------------------------
// atomics
// atomics not finished yet
io.lsu.atomics := DontCare
atomicsReplayUnit.io := DontCare
// sanity check
val atomicsReq = io.lsu.atomics.req
//----------------------------------------
// miss queue
val MissReqPortCount = LoadPipelineWidth + 1
val MainPipeMissReqPort = 0
// Request
val missReqArb = Module(new Arbiter(new MissReq, MissReqPortCount))
missReqArb.io.in(MainPipeMissReqPort) <> mainPipe.io.miss_req
for (w <- 0 until LoadPipelineWidth) { missReqArb.io.in(w + 1) <> ldu(w).io.miss_req }
missQueue.io.req <> missReqArb.io.out
// refill to load queue
io.lsu.lsq <> missQueue.io.refill
// tilelink stuff
bus.a <> missQueue.io.mem_acquire
bus.e <> missQueue.io.mem_finish
//----------------------------------------
// probe
probeQueue.io.mem_probe <> bus.b
//----------------------------------------
// mainPipe
val MainPipeReqPortCount = 4
val MissMainPipeReqPort = 0
val StoreMainPipeReqPort = 1
val AtomicsMainPipeReqPort = 2
val ProbeMainPipeReqPort = 3
val mainPipeReqArb = Module(new Arbiter(new MainPipeReq, MainPipeReqPortCount))
mainPipeReqArb.io.in(MissMainPipeReqPort) <> missQueue.io.pipe_req
mainPipeReqArb.io.in(StoreMainPipeReqPort) <> storeReplayUnit.io.pipe_req
mainPipeReqArb.io.in(AtomicsMainPipeReqPort) <> atomicsReplayUnit.io.pipe_req
mainPipeReqArb.io.in(ProbeMainPipeReqPort) <> probeQueue.io.pipe_req
mainPipe.io.req <> mainPipeReqArb.io.out
missQueue.io.pipe_resp <> mainPipe.io.miss_resp
storeReplayUnit.io.pipe_resp <> mainPipe.io.store_resp
atomicsReplayUnit.io.pipe_resp <> mainPipe.io.amo_resp
probeQueue.io.lrsc_locked_block <> mainPipe.io.lrsc_locked_block
//----------------------------------------
// wb
// add a queue between MainPipe and WritebackUnit to reduce MainPipe stalls due to WritebackUnit busy
val wb_queue = Module(new Queue(new WritebackReq, cfg.nReleaseEntries, flow = true))
wb_queue.io.enq <> mainPipe.io.wb_req
wb.io.req <> wb_queue.io.deq
bus.c <> wb.io.mem_release
// connect bus d
missQueue.io.mem_grant.valid := false.B
missQueue.io.mem_grant.bits := DontCare
wb.io.mem_grant.valid := false.B
wb.io.mem_grant.bits := DontCare
// in L1DCache, we only expect Grant[Data] and ReleaseAck
bus.d.ready := false.B
when (bus.d.bits.opcode === TLMessages.Grant || bus.d.bits.opcode === TLMessages.GrantData) {
missQueue.io.mem_grant <> bus.d
} .elsewhen (bus.d.bits.opcode === TLMessages.ReleaseAck) {
wb.io.mem_grant <> bus.d
} .otherwise {
assert (!bus.d.fire())
}
// dcache should only deal with DRAM addresses
when (bus.a.fire()) {
assert(bus.a.bits.address >= 0x80000000L.U)
}
when (bus.b.fire()) {
assert(bus.b.bits.address >= 0x80000000L.U)
}
when (bus.c.fire()) {
assert(bus.c.bits.address >= 0x80000000L.U)
}
io.prefetch.valid := missQueue.io.req.fire()
io.prefetch.bits := missQueue.io.req.bits
}
......@@ -2,6 +2,7 @@ package xiangshan.cache
import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink.ClientMetadata
import utils.XSDebug
......@@ -24,9 +25,6 @@ class LoadPipe extends DCacheModule
})
// LSU requests
// replayed req should never be nacked
assert(!(io.lsu.req.valid && io.lsu.req.bits.meta.replay && io.nack))
// if you got nacked, you can directly pass down
val not_nacked_ready = io.meta_read.ready && io.data_read.ready
val nacked_ready = true.B
......@@ -73,54 +71,35 @@ class LoadPipe extends DCacheModule
val s1_tag_eq_way = wayMap((w: Int) => meta_resp(w).tag === (get_tag(s1_addr))).asUInt
val s1_tag_match_way = wayMap((w: Int) => s1_tag_eq_way(w) && meta_resp(w).coh.isValid()).asUInt
val s1_tag_match = s1_tag_match_way.orR
val s1_hit_meta = Mux1H(s1_tag_match_way, wayMap((w: Int) => meta_resp(w)))
val s1_hit_state = s1_hit_meta.coh
// replacement policy
val replacer = cacheParams.replacement
val s1_repl_way_en = UIntToOH(replacer.way)
val s1_repl_meta = Mux1H(s1_repl_way_en, wayMap((w: Int) => meta_resp(w)))
when (io.miss_req.fire()) {
replacer.miss
}
assert(!(s1_valid && s1_req.meta.replay && io.lsu.s1_kill),
"lsq tried to kill a replayed request!")
val s1_fake_meta = Wire(new L1Metadata)
s1_fake_meta.tag := get_tag(s1_addr)
s1_fake_meta.coh := ClientMetadata.onReset
// when there is no tag match, we give it a fake meta:
// this simplifies our logic in the s2 stage
val s1_hit_meta = Mux(s1_tag_match, Mux1H(s1_tag_match_way, wayMap((w: Int) => meta_resp(w))), s1_fake_meta)
val s1_hit_coh = s1_hit_meta.coh
// stage 2
val s2_req = RegNext(s1_req)
val s2_valid = RegNext(s1_valid && !io.lsu.s1_kill, init = false.B)
val s2_addr = RegNext(s1_addr)
dump_pipeline_reqs("LoadPipe s2", s2_valid, s2_req)
val s2_addr = RegNext(s1_addr)
// hit, miss, nack, permission checking
val s2_tag_match_way = RegNext(s1_tag_match_way)
val s2_tag_match = RegNext(s1_tag_match)
val s2_hit_meta = RegNext(s1_hit_meta)
val s2_hit_state = RegNext(s1_hit_state)
val s2_has_permission = s2_hit_state.onAccess(s2_req.cmd)._1
val s2_new_hit_state = s2_hit_state.onAccess(s2_req.cmd)._3
val s2_repl_meta = RegNext(s1_repl_meta)
val s2_repl_way_en = RegNext(s1_repl_way_en)
val s2_old_meta = Mux(s2_tag_match, s2_hit_meta, s2_repl_meta)
val s2_way_en = Mux(s2_tag_match, s2_tag_match_way, s2_repl_way_en)
// we not only need permissions
// we also require that state does not change on hit
// thus we require new_hit_state === old_hit_state
//
// If state changes on hit,
// we should treat it as not hit, and let mshr deal with it,
// since we can not write meta data on the main pipeline.
// It's possible that we had permission but the state changes on hit,
// e.g. a write to an exclusive but clean block
val s2_hit = s2_tag_match && s2_has_permission && s2_hit_state === s2_new_hit_state
// nacked or not
val s2_nack = Wire(Bool())
val s2_hit_coh = RegNext(s1_hit_coh)
val s2_has_permission = s2_hit_coh.onAccess(s2_req.cmd)._1
val s2_new_hit_coh = s2_hit_coh.onAccess(s2_req.cmd)._3
val s2_hit = s2_tag_match && s2_has_permission && s2_hit_coh === s2_new_hit_coh
// generate data
val s2_data = Wire(Vec(nWays, UInt(encRowBits.W)))
val data_resp = io.data_resp
for (w <- 0 until nWays) {
......@@ -138,15 +117,9 @@ class LoadPipe extends DCacheModule
val s2_data_word = s2_data_words(s2_word_idx)
val s2_decoded = cacheParams.dataCode.decode(s2_data_word)
val s2_data_word_decoded = s2_decoded.corrected
// this assertion is commented out:
// when the TLB misses, s2_hit may still be true,
// which may cause spurious assertion failures
// assert(!(s2_valid && s2_hit && !s2_nack && s2_decoded.uncorrectable))
// when req got nacked, upper levels should replay this request
// the same set is busy
// nacked or not
val s2_nack_hit = RegNext(s1_nack)
// can not allocate mshr for load miss
val s2_nack_no_mshr = io.miss_req.valid && !io.miss_req.ready
......@@ -154,7 +127,7 @@ class LoadPipe extends DCacheModule
// For now, we use DuplicatedDataArray, so no bank conflicts
val s2_nack_data = false.B
s2_nack := s2_nack_hit || s2_nack_no_mshr || s2_nack_data
val s2_nack = s2_nack_hit || s2_nack_no_mshr || s2_nack_data
// only dump these signals when they are actually valid
dump_pipeline_valids("LoadPipe s2", "s2_hit", s2_valid && s2_hit)
......@@ -163,19 +136,18 @@ class LoadPipe extends DCacheModule
dump_pipeline_valids("LoadPipe s2", "s2_nack_no_mshr", s2_valid && s2_nack_no_mshr)
// send load miss to miss queue
io.miss_req.valid := s2_valid && !s2_nack_hit && !s2_nack_data && !s2_hit
io.miss_req.bits.cmd := s2_req.cmd
io.miss_req.bits.addr := get_block_addr(s2_addr)
io.miss_req.bits.tag_match := s2_tag_match
io.miss_req.bits.way_en := s2_way_en
io.miss_req.bits.old_meta := s2_old_meta
io.miss_req.bits.client_id := 0.U
io.miss_req.valid := s2_valid && !s2_nack_hit && !s2_nack_data && !s2_hit
io.miss_req.bits := DontCare
io.miss_req.bits.source := LOAD_SOURCE.U
io.miss_req.bits.cmd := s2_req.cmd
io.miss_req.bits.addr := get_block_addr(s2_addr)
io.miss_req.bits.coh := s2_hit_coh
// send back response
val resp = Wire(ValidIO(new DCacheWordResp))
resp.valid := s2_valid
resp.bits := DontCare
resp.bits.data := s2_data_word_decoded
resp.bits.meta := s2_req.meta
// on miss or nack, upper level should replay request
// but if we successfully sent the request to miss queue
// upper level does not need to replay request
......@@ -188,8 +160,7 @@ class LoadPipe extends DCacheModule
assert(!(resp.valid && !io.lsu.resp.ready))
when (resp.valid) {
XSDebug(s"LoadPipe resp: data: %x id: %d replayed_req: %b miss: %b need_replay: %b\n",
resp.bits.data, resp.bits.meta.id, resp.bits.meta.replay, resp.bits.miss, resp.bits.replay)
resp.bits.dump()
}
// -------
......@@ -197,8 +168,8 @@ class LoadPipe extends DCacheModule
def dump_pipeline_reqs(pipeline_stage_name: String, valid: Bool,
req: DCacheWordReq ) = {
when (valid) {
XSDebug(s"$pipeline_stage_name cmd: %x addr: %x data: %x mask: %x id: %d replay: %b\n",
req.cmd, req.addr, req.data, req.mask, req.meta.id, req.meta.replay)
XSDebug("$pipeline_stage_name: ")
req.dump()
}
}
......
package xiangshan.cache
import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink.{ClientMetadata, ClientStates, TLPermissions}
import utils.{XSDebug, OneHot}
class MainPipeReq extends DCacheBundle
{
// for request that comes from MissQueue
// does this req come from MissQueue
val miss = Bool()
// which MissQueueEntry send this req?
val miss_id = UInt(log2Up(cfg.nMissEntries).W)
// what permission are we granted with?
val miss_param = UInt(TLPermissions.bdWidth.W)
// for request that comes from Probe
// does this req come from Probe
val probe = Bool()
val probe_param = UInt(TLPermissions.bdWidth.W)
// request info
// reqs from MissQueue, Store, AMO use this
// probe does not use this
val source = UInt(sourceTypeWidth.W)
val cmd = UInt(M_SZ.W)
// must be aligned to block
val addr = UInt(PAddrBits.W)
// store
val store_data = UInt((cfg.blockBytes * 8).W)
val store_mask = UInt(cfg.blockBytes.W)
// which word does amo work on?
val word_idx = UInt(log2Up(cfg.blockBytes * 8 / DataBits).W)
val amo_data = UInt(DataBits.W)
val amo_mask = UInt((DataBits/8).W)
val id = UInt(reqIdWidth.W)
def dump() = {
XSDebug("MainPipeReq: miss: %b miss_id: %d miss_param: %d probe: %b probe_param: %d source: %d cmd: %d addr: %x store_data: %x store_mask: %x word_idx: %d data: %x mask: %x id: %d\n",
miss, miss_id, miss_param, probe, probe_param, source, cmd, addr, store_data, store_mask, word_idx, amo_data, amo_mask, id)
}
}
class MainPipeResp extends DCacheBundle
{
val id = UInt(reqIdWidth.W)
// AMO resp data
val data = UInt(DataBits.W)
val miss = Bool()
val replay = Bool()
def dump() = {
XSDebug("MainPipeResp: id: %d data: %x miss: %b replay: %b\n",
id, data, miss, replay)
}
}
class MainPipe extends DCacheModule
{
val io = IO(new DCacheBundle {
// req and resp
val req = Flipped(DecoupledIO(new MainPipeReq))
val miss_req = DecoupledIO(new MissReq)
val miss_resp = ValidIO(new MainPipeResp)
val store_resp = ValidIO(new MainPipeResp)
val amo_resp = ValidIO(new MainPipeResp)
// meta/data read/write
val data_read = DecoupledIO(new L1DataReadReq)
val data_resp = Input(Vec(nWays, Vec(blockRows, Bits(encRowBits.W))))
val data_write = DecoupledIO(new L1DataWriteReq)
val meta_read = DecoupledIO(new L1MetaReadReq)
val meta_resp = Input(Vec(nWays, new L1Metadata))
val meta_write = DecoupledIO(new L1MetaWriteReq)
// write back
val wb_req = DecoupledIO(new WritebackReq)
// lrsc locked block should block probe
val lrsc_locked_block = Output(Valid(UInt(PAddrBits.W)))
})
// assign default value to output signals
io.req.ready := false.B
io.miss_resp.valid := false.B
io.store_resp.valid := false.B
io.amo_resp.valid := false.B
io.data_read.valid := false.B
io.data_write.valid := false.B
io.data_write.bits := DontCare
io.meta_read.valid := false.B
io.meta_write.valid := false.B
io.meta_write.bits := DontCare
io.wb_req.valid := false.B
io.wb_req.bits := DontCare
io.lrsc_locked_block.valid := false.B
io.lrsc_locked_block.bits := DontCare
// Pipeline
// TODO: add full bypass for meta and data, bypass should be based on block address match
val stall = Wire(Bool())
stall := DontCare
// --------------------------------------------------------------------------------
// stage 0
// read meta and data
// valid: this pipeline has valid req
// fire: req fired and will appear in next pipeline stage
val s0_valid = io.req.valid
val s0_fire = io.req.fire()
val s0_req = io.req.bits
val word_full_overwrite = Wire(Vec(blockRows, Bits(rowWords.W)))
for (i <- 0 until blockRows) {
word_full_overwrite(i) := VecInit((0 until rowWords) map { r =>
val rowMask = s0_req.store_mask((i + 1) * rowBytes - 1, i * rowBytes)
rowMask((r + 1) * wordBytes - 1, r * wordBytes).andR
}).asUInt
}
val row_full_overwrite = VecInit(word_full_overwrite.map(w => w.andR)).asUInt
val full_overwrite = row_full_overwrite.andR
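// Worked example (assuming 64B blocks, 64-bit rows and rowWords == 1, the
// configuration this implementation requires): blockRows == 8, and each
// word_full_overwrite(i) is a single bit set iff all 8 byte-mask bits of row i
// are set; full_overwrite is then the AND over all 8 rows, i.e. the store
// covers the entire block.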
// If req comes from MissQueue, it must be a full overwrite,
// but we still need to read the data array,
// since we may do a replacement.
// If it's a store (not from MissQueue):
// if it's a full mask, no need to read the data array;
// if it's a partial mask, only the not-fully-masked words need to be read.
// If it's an AMO (not from MissQueue), only the specific word needs to be read.
// If it's a probe, read it all.
// Do not leave out !s0_req.probe:
// if it's a probe, all data mask fields are useless.
// don't worry about duplicate conditions
// backend tools will remove them
val miss_need_data = s0_req.miss
val store_need_data = !s0_req.miss && !s0_req.probe && s0_req.source === STORE_SOURCE.U && !full_overwrite
val amo_need_data = !s0_req.miss && !s0_req.probe && s0_req.source === AMO_SOURCE.U
val probe_need_data = s0_req.probe
val need_data = miss_need_data || store_need_data || amo_need_data || probe_need_data
val meta_read = io.meta_read.bits
val data_read = io.data_read.bits
val s1_s0_set_conflict = Wire(Bool())
val s2_s0_set_conflict = Wire(Bool())
val set_conflict = s1_s0_set_conflict || s2_s0_set_conflict
// sanity check
when (s0_fire) {
when (s0_req.miss) {
assert (full_overwrite)
}
// the AMO path is not finished yet, so no AMO reqs should reach the main pipe
assert (s0_req.source =/= AMO_SOURCE.U)
OneHot.checkOneHot(Seq(s0_req.miss, s0_req.probe))
}
val meta_ready = io.meta_read.ready
val data_ready = !need_data || io.data_read.ready
io.req.ready := meta_ready && data_ready && !set_conflict
io.meta_read.valid := io.req.valid && !set_conflict
io.data_read.valid := io.req.valid && need_data && !set_conflict
// Tag read for new requests
meta_read.idx := get_idx(s0_req.addr)
meta_read.way_en := ~0.U(nWays.W)
meta_read.tag := DontCare
// Data read for new requests
data_read.addr := s0_req.addr
data_read.way_en := ~0.U(nWays.W)
val rowWordBits = log2Floor(rowWords)
val amo_row = s0_req.word_idx >> rowWordBits
val amo_word = if (rowWordBits == 0) 0.U else s0_req.word_idx(rowWordBits - 1, 0)
val store_rmask = ~row_full_overwrite
val amo_rmask = UIntToOH(amo_row)
val full_rmask = ~0.U(blockRows.W)
val none_rmask = 0.U(blockRows.W)
// generate wmask here and use it in stage 2
val store_wmask = word_full_overwrite
val amo_wmask = WireInit(VecInit((0 until blockRows) map (i => 0.U(rowWords.W))))
amo_wmask(amo_row) := VecInit((0 until rowWords) map (w => w.U === amo_word)).asUInt
val full_wmask = VecInit((0 until blockRows) map (i => ~0.U(rowWords.W)))
val none_wmask = VecInit((0 until blockRows) map (i => 0.U(rowWords.W)))
data_read.rmask := Mux(store_need_data, store_rmask,
Mux(amo_need_data, amo_rmask,
Mux(probe_need_data || miss_need_data, full_rmask, none_rmask)))
dump_pipeline_reqs("MainPipe s0", s0_valid, s0_req)
// --------------------------------------------------------------------------------
// stage 1
// read out meta, check hit or miss
// TODO: add stalling
val s1_valid = RegInit(false.B)
val s1_fire = s1_valid
val s1_req = RegEnable(s0_req, s0_fire)
val s1_store_wmask = RegEnable(store_wmask, s0_fire)
val s1_amo_wmask = RegEnable(amo_wmask, s0_fire)
val s1_full_wmask = RegEnable(full_wmask, s0_fire)
val s1_none_wmask = RegEnable(none_wmask, s0_fire)
s1_s0_set_conflict := s1_valid && get_idx(s1_req.addr) === get_idx(s0_req.addr)
when (s0_fire) { s1_valid := true.B }
when (!s0_fire && s1_fire) { s1_valid := false.B }
dump_pipeline_reqs("MainPipe s1", s1_valid, s1_req)
val meta_resp = io.meta_resp
// tag check
def wayMap[T <: Data](f: Int => T) = VecInit((0 until nWays).map(f))
val s1_tag_eq_way = wayMap((w: Int) => meta_resp(w).tag === (get_tag(s1_req.addr))).asUInt
val s1_tag_match_way = wayMap((w: Int) => s1_tag_eq_way(w) && meta_resp(w).coh.isValid()).asUInt
val s1_tag_match = s1_tag_match_way.orR
val s1_fake_meta = Wire(new L1Metadata)
s1_fake_meta.tag := get_tag(s1_req.addr)
s1_fake_meta.coh := ClientMetadata.onReset
// when there is no tag match, we give it a fake meta:
// this simplifies our logic in the s2 stage
val s1_hit_meta = Mux(s1_tag_match, Mux1H(s1_tag_match_way, wayMap((w: Int) => meta_resp(w))), s1_fake_meta)
val s1_hit_coh = s1_hit_meta.coh
// replacement policy
val replacer = cacheParams.replacement
val s1_repl_way_en = UIntToOH(replacer.way)
val s1_repl_meta = Mux1H(s1_repl_way_en, wayMap((w: Int) => meta_resp(w)))
val s1_repl_coh = s1_repl_meta.coh
// for now, since we are using random replacement
// we only need to update replacement states after every valid replacement decision
// we only do replacement on a true miss (not a permission miss)
when (s1_fire) {
when (s1_req.miss && !s1_tag_match) {
replacer.miss
}
}
// --------------------------------------------------------------------------------
// stage 2
// check permissions
// read out data, do write/amo stuff
val s2_valid = RegInit(false.B)
val s2_fire = s2_valid
val s2_req = RegEnable(s1_req, s1_fire)
val s2_store_wmask = RegEnable(s1_store_wmask, s1_fire)
val s2_amo_wmask = RegEnable(s1_amo_wmask, s1_fire)
val s2_full_wmask = RegEnable(s1_full_wmask, s1_fire)
val s2_none_wmask = RegEnable(s1_none_wmask, s1_fire)
s2_s0_set_conflict := s2_valid && get_idx(s2_req.addr) === get_idx(s0_req.addr)
when (s1_fire) { s2_valid := true.B }
when (!s1_fire && s2_fire) { s2_valid := false.B }
dump_pipeline_reqs("MainPipe s2", s2_valid, s2_req)
val s2_tag_match_way = RegNext(s1_tag_match_way)
val s2_tag_match = RegNext(s1_tag_match)
val s2_hit_meta = RegNext(s1_hit_meta)
val s2_hit_coh = RegNext(s1_hit_coh)
val s2_has_permission = s2_hit_coh.onAccess(s2_req.cmd)._1
val s2_new_hit_coh = s2_hit_coh.onAccess(s2_req.cmd)._3
val s2_repl_meta = RegNext(s1_repl_meta)
val s2_repl_coh = RegNext(s1_repl_coh)
val s2_repl_way_en = RegNext(s1_repl_way_en)
// only a true miss request (not a permission miss) needs to do replacement
// we use repl meta only when we really need to do a replacement
val need_replacement = s2_req.miss && !s2_tag_match
val s2_way_en = Mux(need_replacement, s2_repl_way_en, s2_tag_match_way)
val s2_meta = Mux(need_replacement, s2_repl_meta, s2_hit_meta)
val s2_coh = Mux(need_replacement, s2_repl_coh, s2_hit_coh)
// --------------------------------------------------------------------------------
// Permission checking
val miss_new_coh = s2_coh.onGrant(s2_req.cmd, s2_req.miss_param)
when (s2_valid) {
// permission checking for miss refill
when (s2_req.miss) {
// if miss refill req hits in dcache
// make sure it has enough permission to complete this cmd
assert (miss_new_coh.isValid())
when (s2_tag_match) {
// if miss refill req hits in dcache
// then the old permission should be lower than new permission
// otherwise we would not miss
assert (s2_hit_coh.state < miss_new_coh.state)
}
}
}
// Determine what state to go to based on Probe param
val (probe_has_dirty_data, probe_shrink_param, probe_new_coh) = s2_coh.onProbe(s2_req.probe_param)
// as long as we have permission,
// we will treat it as a hit:
// if we need to update meta from Trunk to Dirty,
// we just go update it
val s2_hit = s2_tag_match && s2_has_permission
val s2_store_hit = s2_hit && !s2_req.miss && !s2_req.probe && s2_req.source === STORE_SOURCE.U
val s2_amo_hit = s2_hit && !s2_req.miss && !s2_req.probe && s2_req.source === AMO_SOURCE.U
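// Illustrative case: a store to a Trunk (exclusive but clean) block has
// permission, so s2_hit is true even though onAccess returns Dirty as the new
// state; the meta write below then records the Trunk -> Dirty transition in
// place, without involving the miss queue.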
when (s2_valid) {
XSDebug("MainPipe: s2 s2_tag_match: %b s2_has_permission: %b s2_hit: %b need_replacement: %b s2_way_en: %x s2_state: %d\n",
s2_tag_match, s2_has_permission, s2_hit, need_replacement, s2_way_en, s2_coh.state)
}
// --------------------------------------------------------------------------------
// Write to MetaArray
// whether we need to update meta
// miss should always update meta
val miss_update_meta = s2_req.miss
val probe_update_meta = s2_req.probe && s2_tag_match && s2_coh =/= probe_new_coh
// store only updates meta when it hits and needs to update Trunk to Dirty
val store_update_meta = s2_store_hit && s2_hit_coh =/= s2_new_hit_coh
val amo_update_meta = s2_amo_hit && s2_hit_coh =/= s2_new_hit_coh
val update_meta = miss_update_meta || probe_update_meta || store_update_meta || amo_update_meta
val new_coh = Mux(miss_update_meta, miss_new_coh,
Mux(probe_update_meta, probe_new_coh,
Mux(store_update_meta || amo_update_meta, s2_new_hit_coh, ClientMetadata.onReset)))
io.meta_write.valid := s2_valid && update_meta
io.meta_write.bits.idx := get_idx(s2_req.addr)
io.meta_write.bits.data.coh := new_coh
io.meta_write.bits.data.tag := get_tag(s2_req.addr)
io.meta_write.bits.way_en := s2_way_en
// --------------------------------------------------------------------------------
// Write to DataArray
// Miss:
// 1. not store and not amo, data: store_data mask: store_mask(full_mask)
// 2. store, data: store_data mask: store_mask(full_mask)
// 3. amo, data: merge(store_data, amo_data, amo_mask) mask: store_mask(full_mask)
//
// Probe: do not write data, DontCare
// Store hit: data: merge(s2_data, store_data, store_mask) mask: store_mask
// AMO hit: data: merge(s2_data, amo_data, amo_mask) mask: amo_wmask
// so we can first generate store data and then merge with amo_data
// generate write mask
val wmask = Mux(s2_req.miss, s2_full_wmask,
Mux(s2_store_hit, s2_store_wmask,
Mux(s2_amo_hit, s2_amo_wmask,
s2_none_wmask)))
// write data if any bit of the write mask is set
val need_write_data = VecInit(wmask.map(w => w.orR)).asUInt.orR
// generate write data
val store_data_merged = Wire(Vec(blockRows, UInt(rowBits.W)))
def mergePutData(old_data: UInt, new_data: UInt, wmask: UInt): UInt = {
val full_wmask = FillInterleaved(8, wmask)
((~full_wmask & old_data) | (full_wmask & new_data))
}
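// Worked example (illustrative): for a 32-bit word with wmask = "b0011".U,
// FillInterleaved(8, wmask) = 0x0000ffff, so
//   mergePutData(0xaabbccdd.U, 0x11223344.U, "b0011".U) = 0xaabb3344.U,
// i.e. only the two low bytes selected by the mask are taken from new_data.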
val s2_data = Mux1H(s2_way_en, io.data_resp)
val s2_data_decoded = (0 until blockRows) map { r =>
(0 until rowWords) map { w =>
val data = s2_data(r)(encWordBits * (w + 1) - 1, encWordBits * w)
val decoded = cacheParams.dataCode.decode(data)
assert(!(s2_valid && s2_hit && decoded.uncorrectable))
decoded.corrected
}
}
// TODO: deal with ECC errors
for (i <- 0 until blockRows) {
store_data_merged(i) := Cat((0 until rowWords).reverse map { w =>
val old_data = s2_data_decoded(i)(w)
val new_data = s2_req.store_data(rowBits * (i + 1) - 1, rowBits * i)(wordBits * (w + 1) - 1, wordBits * w)
val wmask = s2_req.store_mask(rowBytes * (i + 1) - 1, rowBytes * i)(wordBytes * (w + 1) - 1, wordBytes * w)
val store_data = mergePutData(old_data, new_data, wmask)
store_data
})
}
val amo_data_merged = Wire(Vec(blockRows, UInt(rowBits.W)))
for (i <- 0 until blockRows) {
amo_data_merged(i) := store_data_merged(i)
}
// TODO: do amo calculation
// and merge amo data
/*
for (i <- 0 until blockRows) {
store_data_merged(i) := Cat((0 until rowWords).reverse map { w =>
val old_data = store_data_merged(i)(w)
val wmask = Mux(s2_req.source === AMO_SOURCE.U && (s2_req.miss || s2_hit) && s2_req.word_idx === i.U, s2_req.amo_mask, 0.U)
val store_data = mergePutData(old_data, new_data, wmask)
})
}
*/
// ECC encode data
val wdata_merged = Wire(Vec(blockRows, UInt(encRowBits.W)))
for (i <- 0 until blockRows) {
wdata_merged(i) := Cat((0 until rowWords).reverse map { w =>
val wdata = amo_data_merged(i)(wordBits * (w + 1) - 1, wordBits * w)
val wdata_encoded = cacheParams.dataCode.encode(wdata)
wdata_encoded
})
}
val data_write = io.data_write.bits
io.data_write.valid := s2_valid && need_write_data
data_write.rmask := DontCare
data_write.way_en := s2_way_en
data_write.addr := s2_req.addr
data_write.wmask := wmask
data_write.data := wdata_merged
assert(!(io.data_write.valid && !io.data_write.ready))
// --------------------------------------------------------------------------------
// Writeback
// whether we need to write back a block
// TODO: add support for ProbePerm
// Now, we only deal with ProbeBlock
val miss_writeback = need_replacement && s2_coh.state === ClientStates.Dirty
// even if the probe missed, we still need the writeback unit to send a ProbeAck NtoN response
// val probe_writeback = s2_req.probe && s2_tag_match && s2_coh.state =/= probe_new_coh.state
val probe_writeback = s2_req.probe
val need_writeback = miss_writeback || probe_writeback
val writeback_addr = Cat(s2_meta.tag, get_idx(s2_req.addr)) << blockOffBits
val (_, miss_shrink_param, _) = s2_coh.onCacheControl(M_FLUSH)
val writeback_param = Mux(miss_writeback, miss_shrink_param, probe_shrink_param)
val writeback_data = s2_coh.state === ClientStates.Dirty
val wb_req = io.wb_req.bits
io.wb_req.valid := s2_valid && need_writeback
wb_req.addr := writeback_addr
wb_req.param := writeback_param
wb_req.voluntary := miss_writeback
wb_req.hasData := writeback_data
wb_req.data := VecInit(s2_data_decoded.flatten).asUInt
assert(!(io.wb_req.valid && !io.wb_req.ready))
// --------------------------------------------------------------------------------
// send store/amo miss to miss queue
val store_amo_miss = !s2_req.miss && !s2_req.probe && !s2_hit && (s2_req.source === STORE_SOURCE.U || s2_req.source === AMO_SOURCE.U)
io.miss_req.valid := s2_valid && store_amo_miss
io.miss_req.bits.source := s2_req.source
io.miss_req.bits.cmd := s2_req.cmd
io.miss_req.bits.addr := s2_req.addr
io.miss_req.bits.store_data := s2_req.store_data
io.miss_req.bits.store_mask := s2_req.store_mask
io.miss_req.bits.word_idx := s2_req.word_idx
io.miss_req.bits.amo_data := s2_req.amo_data
io.miss_req.bits.amo_mask := s2_req.amo_mask
io.miss_req.bits.coh := s2_coh
io.miss_req.bits.id := s2_req.id
// --------------------------------------------------------------------------------
// send response
val resp = Wire(new MainPipeResp)
// TODO: add amo data out
resp.data := DontCare
resp.id := s2_req.id
resp.miss := store_amo_miss
resp.replay := io.miss_req.valid && !io.miss_req.ready
io.miss_resp.valid := s2_valid && s2_req.miss
io.miss_resp.bits := resp
io.miss_resp.bits.id := s2_req.miss_id
io.store_resp.valid := s2_valid && s2_req.source === STORE_SOURCE.U
io.store_resp.bits := resp
io.amo_resp.valid := s2_valid && s2_req.source === AMO_SOURCE.U
io.amo_resp.bits := resp
when (io.miss_resp.fire()) {
io.miss_resp.bits.dump()
}
when (io.store_resp.fire()) {
io.store_resp.bits.dump()
}
when (io.amo_resp.fire()) {
io.amo_resp.bits.dump()
}
// -------
// Debug logging functions
def dump_pipeline_reqs(pipeline_stage_name: String, valid: Bool, req: MainPipeReq) = {
when (valid) {
XSDebug(s"$pipeline_stage_name ")
req.dump()
}
}
}
package xiangshan.cache
import chisel3._
import chisel3.util._
import chisel3.ExcitingUtils._
import freechips.rocketchip.tilelink.{TLEdgeOut, TLBundleA, TLBundleD, TLBundleE, TLPermissions, TLArbiter, ClientMetadata}
import utils.{HasTLDump, XSDebug, BoolStopWatch, OneHot}
class MissReq extends DCacheBundle
{
val source = UInt(sourceTypeWidth.W)
val cmd = UInt(M_SZ.W)
// must be aligned to block
val addr = UInt(PAddrBits.W)
// store
val store_data = UInt((cfg.blockBytes * 8).W)
val store_mask = UInt(cfg.blockBytes.W)
// which word does amo work on?
val word_idx = UInt(log2Up(blockWords).W)
val amo_data = UInt(DataBits.W)
val amo_mask = UInt((DataBits/8).W)
// coherence state
val coh = new ClientMetadata
val id = UInt(reqIdWidth.W)
def dump() = {
XSDebug("MissReq source: %d cmd: %d addr: %x store_data: %x store_mask: %x word_idx: %d amo_data: %x amo_mask: %x coh: %d id: %d\n",
source, cmd, addr, store_data, store_mask, word_idx, amo_data, amo_mask, coh.state, id)
}
}
// One miss entry deals with one missed block
class MissEntry(edge: TLEdgeOut) extends DCacheModule
{
val io = IO(new Bundle {
// MSHR ID
val id = Input(UInt())
// client requests
val req_valid = Input(Bool())
// this entry is free and can be allocated to new reqs
val primary_ready = Output(Bool())
// this entry is busy, but it can merge the new req
val secondary_ready = Output(Bool())
// this entry is busy and it can not merge the new req
val secondary_reject = Output(Bool())
val req = Input((new MissReq))
val refill = ValidIO(new Refill)
// bus
val mem_acquire = DecoupledIO(new TLBundleA(edge.bundle))
val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
val mem_finish = DecoupledIO(new TLBundleE(edge.bundle))
val pipe_req = DecoupledIO(new MainPipeReq)
val pipe_resp = Flipped(ValidIO(new MainPipeResp))
})
// MSHR:
// 1. receive req
// 2. send acquire req
// 3. receive grant resp
// 4. let main pipe do refill and replace
// 5. wait for resp
// 6. send finish to end the tilelink transaction
// We only send finish after data is written into cache.
// This prevents L2 from probing the block down.
// See Tilelink spec 1.8.1 page 69
// A slave should not issue a Probe if there is a pending GrantAck on the block. Once the Probe is
// issued, the slave should not issue further Probes on that block until it receives a ProbeAck.
val s_invalid :: s_refill_req :: s_refill_resp :: s_main_pipe_req :: s_main_pipe_resp :: s_mem_finish :: Nil = Enum(6)
val state = RegInit(s_invalid)
// --------------------------------------------
// internal registers
val req = Reg(new MissReq)
// param of grant
val grant_param = Reg(UInt(TLPermissions.bdWidth.W))
// recording the source/sink info from Grant
// so that we can use it in GrantAck
val grantack = Reg(Valid(new TLBundleE(edge.bundle)))
// should we refill the data to load queue to wake up any missed load?
val should_refill_data = Reg(Bool())
// --------------------------------------------
// merge reqs
// see whether we can merge requests
// do not count the s_invalid state in,
// since we can not merge requests in that state
val acquire_not_sent = state === s_refill_req && !io.mem_acquire.ready
val data_not_refilled = state === s_refill_req || state === s_refill_resp
def can_merge(new_req: MissReq): Bool = {
// caution: do not merge with AMO;
// we can not do amoalu calculation in MissQueue,
// so we would not know the result after the AMO calculation,
// hence AMO reqs are never merged
// before read acquire is fired, we can merge read or write
val before_read_sent = acquire_not_sent && req.source === LOAD_SOURCE.U && (new_req.source === LOAD_SOURCE.U || new_req.source === STORE_SOURCE.U)
// before read/write refills data to LoadQueue, we can merge any read
val before_data_refill = data_not_refilled && (req.source === LOAD_SOURCE.U || req.source === STORE_SOURCE.U) && new_req.source === LOAD_SOURCE.U
before_read_sent || before_data_refill
}
def should_merge(new_req: MissReq): Bool = {
val block_match = req.addr === new_req.addr
block_match && can_merge(new_req)
}
def should_reject(new_req: MissReq): Bool = {
val block_match = req.addr === new_req.addr
// do not reject any req when we are in s_invalid
block_match && !can_merge(new_req) && state =/= s_invalid
}
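// Illustrative cases, assuming matching block addresses: a load arriving while
// this entry's load Acquire is still queued merges (before_read_sent); a load
// arriving while a load/store miss has not yet refilled its data also merges
// (before_data_refill); a store arriving after the Acquire has fired, or any
// AMO req, is rejected and must be replayed after this entry finishes.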
io.primary_ready := state === s_invalid
io.secondary_ready := should_merge(io.req)
io.secondary_reject := should_reject(io.req)
// should not allocate, merge or reject at the same time
// one at a time
OneHot.checkOneHot(Seq(io.primary_ready, io.secondary_ready, io.secondary_reject))
// --------------------------------------------
// assign default values to output signals
io.refill.valid := false.B
io.refill.bits := DontCare
io.mem_acquire.valid := false.B
io.mem_acquire.bits := DontCare
io.mem_grant.ready := false.B
io.mem_finish.valid := false.B
io.mem_finish.bits := DontCare
io.pipe_req.valid := false.B
io.pipe_req.bits := DontCare
when (state =/= s_invalid) {
XSDebug("entry: %d state: %d\n", io.id, state)
req.dump()
}
// --------------------------------------------
// State Machine
// --------------------------------------------
// receive requests
// primary request: allocate for a new request
when (io.req_valid && io.primary_ready) {
assert (state === s_invalid)
// re init some fields
req := io.req
grantack.valid := false.B
// only miss req from load needs a refill to LoadQueue
should_refill_data := io.req.source === LOAD_SOURCE.U
state := s_refill_req
}
// secondary request: merge with existing request
when (io.req_valid && io.secondary_ready) {
// The merged req should never have higher permissions:
// that would mean the cache silently upgraded the permission of our block
// without merging with this miss queue request!
// Either our req came in with stale meta, or the req that upgraded the permission did not merge with this req.
// Both cases are bugs of DCache.
//
// DCache can silently drop permissions (e.g. when probed or evicted);
// it should never silently upgrade permissions.
//
// TODO: please check Tilelink Metadata.scala
// and make sure that lower permissions are encoded as smaller numbers
assert (io.req.coh.state <= req.coh.state)
// use the most up-to-date meta
req.coh := io.req.coh
// when merging with a store,
// we should record its info in our req,
// or we will not be able to replay the store
when (io.req.source === STORE_SOURCE.U) {
req := io.req
}
should_refill_data := io.req.source === LOAD_SOURCE.U
}
// --------------------------------------------
// refill
when (state === s_refill_req) {
val grow_param = req.coh.onAccess(req.cmd)._2
// for full overwrite, we can use AcquirePerm to save memory bandwidth
val full_overwrite = req.source === STORE_SOURCE.U && req.store_mask.andR
val acquireBlock = edge.AcquireBlock(
fromSource = io.id,
toAddress = req.addr,
lgSize = (log2Up(cfg.blockBytes)).U,
growPermissions = grow_param)._2
val acquirePerm = edge.AcquirePerm(
fromSource = io.id,
toAddress = req.addr,
lgSize = (log2Up(cfg.blockBytes)).U,
growPermissions = grow_param)._2
io.mem_acquire.valid := true.B
io.mem_acquire.bits := Mux(full_overwrite, acquirePerm, acquireBlock)
when (io.mem_acquire.fire()) {
state := s_refill_resp
}
}
val (_, _, refill_done, refill_count) = edge.count(io.mem_grant)
// raw data
val refill_data = Reg(Vec(blockRows, UInt(rowBits.W)))
val new_data = Wire(Vec(blockRows, UInt(rowBits.W)))
val new_mask = Wire(Vec(blockRows, UInt(rowBytes.W)))
for (i <- 0 until blockRows) {
new_data(i) := req.store_data(rowBits * (i + 1) - 1, rowBits * i)
// we only need to merge data for Store
new_mask(i) := Mux(req.source === STORE_SOURCE.U,
req.store_mask(rowBytes * (i + 1) - 1, rowBytes * i), 0.U(rowBytes.W))
}
def mergePutData(old_data: UInt, new_data: UInt, wmask: UInt): UInt = {
val full_wmask = FillInterleaved(8, wmask)
((~full_wmask & old_data) | (full_wmask & new_data))
}
when (state === s_refill_resp) {
io.mem_grant.ready := true.B
when (io.mem_grant.fire()) {
when (edge.hasData(io.mem_grant.bits)) {
refill_data(refill_count) := mergePutData(io.mem_grant.bits.data, new_data(refill_count), new_mask(refill_count))
} .otherwise {
// when we only acquire perm, not data
// use Store's data
for (i <- 0 until blockRows) {
refill_data(i) := new_data(i)
}
}
}
when (refill_done) {
grantack.valid := edge.isRequest(io.mem_grant.bits)
grantack.bits := edge.GrantAck(io.mem_grant.bits)
grant_param := io.mem_grant.bits.param
state := s_main_pipe_req
}
}
io.refill.valid := RegNext(state === s_refill_resp && refill_done && should_refill_data)
io.refill.bits.addr := req.addr
io.refill.bits.data := refill_data.asUInt
when (state === s_main_pipe_req) {
io.pipe_req.valid := true.B
val pipe_req = io.pipe_req.bits
pipe_req.miss := true.B
pipe_req.miss_id := io.id
pipe_req.miss_param := grant_param
pipe_req.probe := false.B
pipe_req.probe_param := DontCare
pipe_req.source := req.source
pipe_req.cmd := req.cmd
pipe_req.addr := req.addr
pipe_req.store_data := refill_data.asUInt
// full overwrite
pipe_req.store_mask := Fill(cfg.blockBytes, "b1".U)
pipe_req.word_idx := req.word_idx
pipe_req.amo_data := req.amo_data
pipe_req.amo_mask := req.amo_mask
pipe_req.id := req.id
when (io.pipe_req.fire()) {
state := s_main_pipe_resp
}
}
when (state === s_main_pipe_resp) {
when (io.pipe_resp.fire()) {
grantack.valid := false.B
state := s_mem_finish
}
}
when (state === s_mem_finish) {
io.mem_finish.valid := grantack.valid
io.mem_finish.bits := grantack.bits
when (io.mem_finish.fire()) {
grantack.valid := false.B
state := s_invalid
}
}
}
class MissQueue(edge: TLEdgeOut) extends DCacheModule with HasTLDump
{
val io = IO(new Bundle {
val req = Flipped(DecoupledIO(new MissReq))
val refill = ValidIO(new Refill)
val mem_acquire = Decoupled(new TLBundleA(edge.bundle))
val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
val mem_finish = Decoupled(new TLBundleE(edge.bundle))
val pipe_req = DecoupledIO(new MainPipeReq)
val pipe_resp = Flipped(ValidIO(new MainPipeResp))
})
val pipe_req_arb = Module(new Arbiter(new MainPipeReq, cfg.nMissEntries))
val refill_arb = Module(new Arbiter(new Refill, cfg.nMissEntries))
// dispatch req to MSHR
val primary_ready = Wire(Vec(cfg.nMissEntries, Bool()))
val secondary_ready = Wire(Vec(cfg.nMissEntries, Bool()))
val secondary_reject = Wire(Vec(cfg.nMissEntries, Bool()))
// try merging with existing reqs
val merge = secondary_ready.asUInt.orR
val merge_idx = PriorityEncoder(secondary_ready)
// some entry says the request can not be merged
val reject = secondary_reject.asUInt.orR
// allocate a new entry for this req
val allocate = !reject && !merge && primary_ready.asUInt.orR
val alloc_idx = PriorityEncoder(primary_ready)
// will this req be accepted
val accept = (merge || allocate) && !reject
// if it's accepted, which entry will it enter
val entry_idx = Mux(allocate, alloc_idx, merge_idx)
// for one block, there should be only one MSHR:
// one block should not stay in multiple MSHRs;
// if a req can not merge with existing reqs,
// block it!
OneHot.checkOneHot(secondary_ready)
OneHot.checkOneHot(secondary_reject)
// should not merge and reject at the same time
OneHot.checkOneHot(Seq(merge, reject))
io.req.ready := accept
io.mem_grant.ready := false.B
val entries = (0 until cfg.nMissEntries) map { i =>
val entry = Module(new MissEntry(edge))
entry.io.id := i.U(log2Up(cfg.nMissEntries).W)
// entry req
entry.io.req_valid := (i.U === entry_idx) && accept && io.req.valid
primary_ready(i) := entry.io.primary_ready
secondary_ready(i) := entry.io.secondary_ready
secondary_reject(i) := entry.io.secondary_reject
entry.io.req := io.req.bits
// entry refill
refill_arb.io.in(i).valid := entry.io.refill.valid
refill_arb.io.in(i).bits := entry.io.refill.bits
// pipe_req
pipe_req_arb.io.in(i) <> entry.io.pipe_req
// pipe_resp
entry.io.pipe_resp.valid := false.B
entry.io.pipe_resp.bits := DontCare
when (io.pipe_resp.bits.id === i.U) {
entry.io.pipe_resp <> io.pipe_resp
}
entry.io.mem_grant.valid := false.B
entry.io.mem_grant.bits := DontCare
when (io.mem_grant.bits.source === i.U) {
entry.io.mem_grant <> io.mem_grant
}
/*
if (!env.FPGAPlatform) {
ExcitingUtils.addSource(
BoolStopWatch(
start = entry.io.req.fire(),
stop = entry.io.resp.fire(),
startHighPriority = true),
"perfCntDCacheMissQueuePenaltyEntry" + Integer.toString(i, 10),
Perf
)
}
*/
entry
}
io.refill.valid := refill_arb.io.out.valid
io.refill.bits := refill_arb.io.out.bits
refill_arb.io.out.ready := true.B
// one refill at a time
OneHot.checkOneHot(refill_arb.io.in.map(r => r.valid))
TLArbiter.lowestFromSeq(edge, io.mem_acquire, entries.map(_.io.mem_acquire))
TLArbiter.lowestFromSeq(edge, io.mem_finish, entries.map(_.io.mem_finish))
io.pipe_req <> pipe_req_arb.io.out
// print all input/output requests for debug purposes
when (io.req.fire()) {
io.req.bits.dump()
// sanity check
val source = io.req.bits.source
val cmd = io.req.bits.cmd
when (source === LOAD_SOURCE.U) {
assert (cmd === M_XRD)
}
when (source === STORE_SOURCE.U) {
assert (cmd === M_XWR)
}
when (source === AMO_SOURCE.U) {
assert (
cmd === M_XA_SWAP ||
cmd === M_XLR ||
cmd === M_XSC ||
cmd === M_XA_ADD ||
cmd === M_XA_XOR ||
cmd === M_XA_OR ||
cmd === M_XA_AND ||
cmd === M_XA_MIN ||
cmd === M_XA_MAX ||
cmd === M_XA_MINU ||
cmd === M_XA_MAXU)
}
// req addr must be aligned to block boundary
assert (io.req.bits.addr(blockOffBits - 1, 0) === 0.U)
}
when (io.refill.fire()) {
io.refill.bits.dump()
}
when (io.mem_acquire.fire()) {
XSDebug("mem_acquire ")
io.mem_acquire.bits.dump
}
when (io.mem_grant.fire()) {
XSDebug("mem_grant ")
io.mem_grant.bits.dump
}
when (io.mem_finish.fire()) {
XSDebug("mem_finish ")
io.mem_finish.bits.dump
}
if (!env.FPGAPlatform) {
ExcitingUtils.addSource(io.req.fire(), "perfCntDCacheMiss", Perf)
}
}
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.XSDebug
import freechips.rocketchip.tilelink.{TLEdgeOut, TLBundleB, TLMessages, TLPermissions}
import utils.{HasTLDump, XSDebug}
class ProbeReq extends DCacheBundle
{
val source = UInt()
val opcode = UInt()
val addr = UInt(PAddrBits.W)
val param = UInt(TLPermissions.bdWidth.W)
def dump() = {
XSDebug("ProbeReq source: %d opcode: %d addr: %x param: %d\n",
source, opcode, addr, param)
}
}
class ProbeEntry extends DCacheModule {
val io = IO(new Bundle {
val req = Flipped(Decoupled(new ProbeReq))
val pipe_req = DecoupledIO(new MainPipeReq)
val lrsc_locked_block = Input(Valid(UInt()))
// the block we are probing
val block_addr = Output(Valid(UInt()))
})
val s_invalid :: s_pipe_req :: Nil = Enum(2)
val state = RegInit(s_invalid)
val req = Reg(new ProbeReq)
// assign default values to signals
io.req.ready := false.B
io.pipe_req.valid := false.B
io.pipe_req.bits := DontCare
io.block_addr.valid := state =/= s_invalid
io.block_addr.bits := req.addr
when (state =/= s_invalid) {
XSDebug("state: %d\n", state)
}
when (state === s_invalid) {
io.req.ready := true.B
when (io.req.fire()) {
req := io.req.bits
state := s_pipe_req
}
}
when (state === s_pipe_req) {
val lrsc_blocked = io.lrsc_locked_block.valid && io.lrsc_locked_block.bits === req.addr
io.pipe_req.valid := !lrsc_blocked
val pipe_req = io.pipe_req.bits
pipe_req := DontCare
pipe_req.miss := false.B
pipe_req.probe := true.B
pipe_req.probe_param := req.param
pipe_req.addr := req.addr
when (io.pipe_req.fire()) {
state := s_invalid
}
}
}
class ProbeQueue(edge: TLEdgeOut) extends DCacheModule with HasTLDump
{
val io = IO(new Bundle {
val mem_probe = Flipped(Decoupled(new TLBundleB(edge.bundle)))
val pipe_req = DecoupledIO(new MainPipeReq)
val lrsc_locked_block = Input(Valid(UInt()))
})
val pipe_req_arb = Module(new Arbiter(new MainPipeReq, cfg.nProbeEntries))
// allocate a free entry for incoming request
val primary_ready = Wire(Vec(cfg.nProbeEntries, Bool()))
val allocate = primary_ready.asUInt.orR
val alloc_idx = PriorityEncoder(primary_ready)
// translate to inner req
val req = Wire(new ProbeReq)
req.source := io.mem_probe.bits.source
req.opcode := io.mem_probe.bits.opcode
req.addr := io.mem_probe.bits.address
req.param := io.mem_probe.bits.param
io.mem_probe.ready := allocate
val entries = (0 until cfg.nProbeEntries) map { i =>
val entry = Module(new ProbeEntry)
// entry req
entry.io.req.valid := (i.U === alloc_idx) && allocate && io.mem_probe.valid
primary_ready(i) := entry.io.req.ready
entry.io.req.bits := req
// pipe_req
pipe_req_arb.io.in(i) <> entry.io.pipe_req
entry.io.lrsc_locked_block := io.lrsc_locked_block
entry
}
io.pipe_req <> pipe_req_arb.io.out
// print all input/output requests for debug purposes
when (io.mem_probe.valid) {
// before a probe finishes, L2 should not further issue probes on this block
val probe_conflict = VecInit(entries.map(e => e.io.block_addr.valid && e.io.block_addr.bits === io.mem_probe.bits.address)).asUInt.orR
assert (!probe_conflict)
// for now, we can only deal with ProbeBlock
assert (io.mem_probe.bits.opcode === TLMessages.Probe)
}
}
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.XSDebug
import bus.tilelink._
class StoreReplayEntry extends DCacheModule
{
val io = IO(new Bundle {
val id = Input(UInt())
val lsu = Flipped(new DCacheLineIO)
val pipe_req = Decoupled(new MainPipeReq)
val pipe_resp = Flipped(ValidIO(new MainPipeResp))
val block_addr = Output(Valid(UInt()))
})
val s_invalid :: s_pipe_req :: s_pipe_resp :: s_resp :: Nil = Enum(4)
val state = RegInit(s_invalid)
val req = Reg(new DCacheLineReq)
// assign default values to output signals
io.lsu.req.ready := state === s_invalid
io.lsu.resp.valid := false.B
io.lsu.resp.bits := DontCare
io.pipe_req.valid := false.B
io.pipe_req.bits := DontCare
io.block_addr.valid := state =/= s_invalid
io.block_addr.bits := req.addr
when (state =/= s_invalid) {
XSDebug("StoreReplayEntry: %d state: %d block_addr: %x\n", io.id, state, io.block_addr.bits)
}
// --------------------------------------------
// s_invalid: receive requests
when (state === s_invalid) {
when (io.lsu.req.fire()) {
req := io.lsu.req.bits
state := s_pipe_req
}
}
// --------------------------------------------
// replay
when (state === s_pipe_req) {
io.pipe_req.valid := true.B
val pipe_req = io.pipe_req.bits
pipe_req := DontCare
pipe_req.miss := false.B
pipe_req.probe := false.B
pipe_req.source := STORE_SOURCE.U
pipe_req.cmd := req.cmd
pipe_req.addr := req.addr
pipe_req.store_data := req.data
pipe_req.store_mask := req.mask
pipe_req.id := io.id
when (io.pipe_req.fire()) {
state := s_pipe_resp
}
}
when (state === s_pipe_resp) {
// when not miss
// everything is OK, simply send response back to sbuffer
// when miss and not replay
// wait for missQueue to handle the miss and replay our request
// when miss and replay
// req missed and failed to enter missQueue, manually replay it later
// TODO: add assertions:
// 1. add a replay delay counter?
// 2. when req gets into MissQueue, it should not miss any more
when (io.pipe_resp.fire()) {
when (io.pipe_resp.bits.miss) {
when (io.pipe_resp.bits.replay) {
state := s_pipe_req
}
} .otherwise {
state := s_resp
}
}
}
// --------------------------------------------
when (state === s_resp) {
io.lsu.resp.valid := true.B
io.lsu.resp.bits := DontCare
io.lsu.resp.bits.id := req.id
when (io.lsu.resp.fire()) {
state := s_invalid
}
}
// debug output
when (io.lsu.req.fire()) {
XSDebug(s"StoreReplayEntryTransaction req %d\n", io.id)
}
when (io.lsu.resp.fire()) {
XSDebug(s"StoreReplayEntryTransaction resp %d\n", io.id)
}
}
class StoreReplayQueue extends DCacheModule
{
val io = IO(new Bundle {
val lsu = Flipped(new DCacheLineIO)
val pipe_req = Decoupled(new MainPipeReq)
val pipe_resp = Flipped(ValidIO(new MainPipeResp))
})
val pipe_req_arb = Module(new Arbiter(new MainPipeReq, cfg.nStoreReplayEntries))
val resp_arb = Module(new Arbiter(new DCacheLineResp, cfg.nStoreReplayEntries))
// allocate a free entry for incoming request
val primary_ready = Wire(Vec(cfg.nStoreReplayEntries, Bool()))
val allocate = primary_ready.asUInt.orR
val alloc_idx = PriorityEncoder(primary_ready)
val req = io.lsu.req
req.ready := allocate
val entries = (0 until cfg.nStoreReplayEntries) map { i =>
val entry = Module(new StoreReplayEntry)
entry.io.id := i.U
// entry req
entry.io.lsu.req.valid := (i.U === alloc_idx) && allocate && req.valid
primary_ready(i) := entry.io.lsu.req.ready
entry.io.lsu.req.bits := req.bits
// lsu req and resp
resp_arb.io.in(i) <> entry.io.lsu.resp
// replay req and resp
pipe_req_arb.io.in(i) <> entry.io.pipe_req
entry.io.pipe_resp.valid := (i.U === io.pipe_resp.bits.id) && io.pipe_resp.valid
entry.io.pipe_resp.bits := io.pipe_resp.bits
entry
}
io.lsu.resp <> resp_arb.io.out
io.pipe_req <> pipe_req_arb.io.out
// sanity check
when (io.lsu.req.valid) {
assert(io.lsu.req.bits.cmd === M_XWR)
val block_conflict = VecInit(entries.map(e => e.io.block_addr.valid && e.io.block_addr.bits === io.lsu.req.bits.addr)).asUInt.orR
assert (!block_conflict)
}
// debug output
when (io.lsu.req.fire()) {
io.lsu.req.bits.dump()
}
when (io.lsu.resp.fire()) {
io.lsu.resp.bits.dump()
}
when (io.pipe_req.fire()) {
io.pipe_req.bits.dump()
}
when (io.pipe_resp.fire()) {
io.pipe_resp.bits.dump()
}
}
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.XSDebug
import freechips.rocketchip.tilelink.{TLBundleC, TLBundleD, TLEdgeOut, TLPermissions}
class WritebackReq extends DCacheBundle {
val addr = UInt(PAddrBits.W)
val param = UInt(TLPermissions.cWidth.W)
val voluntary = Bool()
val hasData = Bool()
val data = UInt((cfg.blockBytes * 8).W)
def dump() = {
XSDebug("WritebackReq addr: %x param: %d voluntary: %b hasData: %b data: %x\n",
addr, param, voluntary, hasData, data)
}
}
class WritebackUnit(edge: TLEdgeOut) extends DCacheModule {
val io = IO(new Bundle {
val req = Flipped(DecoupledIO(new WritebackReq))
val mem_release = DecoupledIO(new TLBundleC(edge.bundle))
val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
})
// remaining beats
val remain = RegInit(0.U(refillCycles.W))
val remain_set = WireInit(0.U(refillCycles.W))
val remain_clr = WireInit(0.U(refillCycles.W))
remain := (remain | remain_set) & ~remain_clr
// used source id
// source id 0 is reserved for ProbeAck[Data]
val used = RegInit(0.U((cfg.nReleaseEntries - 1).W))
val used_set = WireInit(0.U((cfg.nReleaseEntries - 1).W))
val used_clr = WireInit(0.U((cfg.nReleaseEntries - 1).W))
used := (used | used_set) & ~used_clr
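// set/clear register idiom: e.g. with used = b0001, used_set = b0010 and
// used_clr = b0001, used becomes b0010 next cycle; clear wins over set
// only when both target the same bit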
val busy = remain.orR
val all_used = used.andR
val req_reg = Reg(new WritebackReq)
val req = Mux(busy, req_reg, io.req.bits)
// --------------------------------------------------------------------------------
// new req entering
// source to use for this transaction
val source = Reg(UInt())
io.req.ready := !busy && (!io.req.bits.voluntary || !all_used)
when (io.req.fire()) {
remain_set := Mux(io.req.bits.hasData, ~0.U(refillCycles.W), 1.U(refillCycles.W))
used_set := Mux(io.req.bits.voluntary, PriorityEncoderOH(~used), 0.U)
// source 0 is reserved for ProbeAck[Data]
source := Mux(io.req.bits.voluntary, PriorityEncoder(~used) + 1.U, 0.U)
}
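// source id mapping sketch: bit i of `used` tracks source id i + 1, so
// voluntary Releases use sources 1 .. nReleaseEntries - 1 while source 0
// is dedicated to ProbeAck[Data]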
// --------------------------------------------------------------------------------
// while there are beats remaining to be sent, we keep sending
// which beat to send in this cycle?
val beat = PriorityEncoder(remain)
val beat_data = Wire(Vec(refillCycles, UInt(beatBits.W)))
for (i <- 0 until refillCycles) {
beat_data(i) := req.data((i + 1) * beatBits - 1, i * beatBits)
}
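// beat_data just reslices the line into bus beats; e.g. assuming a 512-bit
// block and a 256-bit bus (refillCycles = 2), beat_data(0) = req.data(255, 0)
// and beat_data(1) = req.data(511, 256)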
val probeResponse = edge.ProbeAck(
fromSource = source,
toAddress = req.addr,
lgSize = log2Ceil(cfg.blockBytes).U,
reportPermissions = req.param
)
val probeResponseData = edge.ProbeAck(
fromSource = source,
toAddress = req.addr,
lgSize = log2Ceil(cfg.blockBytes).U,
reportPermissions = req.param,
data = beat_data(beat)
)
val voluntaryRelease = edge.Release(
fromSource = source,
toAddress = req.addr,
lgSize = log2Ceil(cfg.blockBytes).U,
shrinkPermissions = req.param
)._2
val voluntaryReleaseData = edge.Release(
fromSource = source,
toAddress = req.addr,
lgSize = log2Ceil(cfg.blockBytes).U,
shrinkPermissions = req.param,
data = beat_data(beat)
)._2
io.mem_release.valid := busy
io.mem_release.bits := Mux(req.voluntary,
Mux(req.hasData, voluntaryReleaseData, voluntaryRelease),
Mux(req.hasData, probeResponseData, probeResponse))
when (io.mem_release.fire()) { remain_clr := PriorityEncoderOH(remain) }
// --------------------------------------------------------------------------------
// receive ReleaseAck for Releases
// we only accept grants for source ids we have allocated
// TODO: assert on any invalid grant
io.mem_grant.ready := used(io.mem_grant.bits.source - 1.U)
when (io.mem_grant.fire()) {
used_clr := UIntToOH(io.mem_grant.bits.source - 1.U)
}
// print all input/output requests for debug purpose
// print req
when (io.req.fire()) {
io.req.bits.dump()
}
}
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.{XSDebug}
// this is a traditional cache pipeline:
// it handles load/store/amo/lr,sc
class AtomicsPipe extends DCacheModule
{
val io = IO(new DCacheBundle{
val lsu = Flipped(new DCacheWordIO)
val data_read = DecoupledIO(new L1DataReadReq)
val data_resp = Input(Vec(nWays, Vec(blockRows, Bits(encRowBits.W))))
val data_write = DecoupledIO(new L1DataWriteReq)
val meta_read = DecoupledIO(new L1MetaReadReq)
val meta_resp = Input(Vec(nWays, new L1Metadata))
val inflight_req_idxes = Output(Vec(3, Valid(UInt())))
val inflight_req_block_addrs = Output(Vec(3, Valid(UInt())))
val block_probe_addr = Output(Valid(UInt()))
val wb_invalidate_lrsc = Input(Valid(UInt()))
// send miss request to miss queue
val miss_req = DecoupledIO(new MissReq)
})
// LSU requests
io.lsu.req.ready := io.meta_read.ready && io.data_read.ready
io.meta_read.valid := io.lsu.req.valid
io.data_read.valid := io.lsu.req.valid
val meta_read = io.meta_read.bits
val data_read = io.data_read.bits
// Tag read for new requests
meta_read.idx := get_idx(io.lsu.req.bits.addr)
meta_read.way_en := ~0.U(nWays.W)
meta_read.tag := DontCare
// Data read for new requests
data_read.addr := io.lsu.req.bits.addr
data_read.way_en := ~0.U(nWays.W)
// only need to read the specific row
data_read.rmask := UIntToOH(get_row(io.lsu.req.bits.addr))
// Pipeline
// ---------------------------------------
// stage 0
val s0_valid = io.lsu.req.fire()
val s0_req = io.lsu.req.bits
dump_pipeline_reqs("AtomicsPipe s0", s0_valid, s0_req)
// ---------------------------------------
// stage 1
val s1_req = RegNext(s0_req)
val s1_valid = RegNext(s0_valid, init = false.B)
val s1_addr = s1_req.addr
val s1_nack = false.B
dump_pipeline_reqs("AtomicsPipe s1", s1_valid, s1_req)
// tag check
val meta_resp = io.meta_resp
def wayMap[T <: Data](f: Int => T) = VecInit((0 until nWays).map(f))
val s1_tag_eq_way = wayMap((w: Int) => meta_resp(w).tag === (get_tag(s1_addr))).asUInt
val s1_tag_match_way = wayMap((w: Int) => s1_tag_eq_way(w) && meta_resp(w).coh.isValid()).asUInt
val s1_tag_match = s1_tag_match_way.orR
val s1_hit_meta = Mux1H(s1_tag_match_way, wayMap((w: Int) => meta_resp(w)))
val s1_hit_state = s1_hit_meta.coh
// replacement policy
val replacer = cacheParams.replacement
val s1_repl_way_en = UIntToOH(replacer.way)
val s1_repl_meta = Mux1H(s1_repl_way_en, wayMap((w: Int) => meta_resp(w)))
when (io.miss_req.fire()) {
replacer.miss
}
// ---------------------------------------
// stage 2
val s2_req = RegNext(s1_req)
val s2_valid = RegNext(s1_valid, init = false.B)
dump_pipeline_reqs("AtomicsPipe s2", s2_valid, s2_req)
val s2_tag_match_way = RegNext(s1_tag_match_way)
val s2_tag_match = s2_tag_match_way.orR
val s2_hit_meta = RegNext(s1_hit_meta)
val s2_hit_state = Mux1H(s2_tag_match_way, wayMap((w: Int) => RegNext(meta_resp(w).coh)))
val s2_has_permission = s2_hit_state.onAccess(s2_req.cmd)._1
val s2_new_hit_state = s2_hit_state.onAccess(s2_req.cmd)._3
val s2_repl_meta = RegNext(s1_repl_meta)
val s2_repl_way_en = RegNext(s1_repl_way_en)
val s2_old_meta = Mux(s2_tag_match, s2_hit_meta, s2_repl_meta)
val s2_way_en = Mux(s2_tag_match, s2_tag_match_way, s2_repl_way_en)
// we not only need permissions
// we also require that state does not change on hit
// thus we require new_hit_state === old_hit_state
//
// If state changes on hit,
// we should treat it as not hit, and let mshr deal with it,
// since we cannot write meta data in the main pipeline.
// It's possible that we had permission but state changes on hit:
// eg: write to exclusive but clean block
val s2_hit = s2_tag_match && s2_has_permission && s2_hit_state === s2_new_hit_state
val s2_nack = Wire(Bool())
// when req got nacked, upper levels should replay this request
// the same set is busy
val s2_nack_hit = RegNext(s1_nack)
// cannot allocate an MSHR for the miss
val s2_nack_no_mshr = io.miss_req.valid && !io.miss_req.ready
// Bank conflict on data arrays
// For now, we use DuplicatedDataArray, so no bank conflicts
val s2_nack_data = false.B
s2_nack := s2_nack_hit || s2_nack_no_mshr || s2_nack_data
// lr/sc
val debug_sc_fail_addr = RegInit(0.U)
val debug_sc_fail_cnt = RegInit(0.U(8.W))
val lrsc_count = RegInit(0.U(log2Ceil(lrscCycles).W))
val lrsc_valid = lrsc_count > lrscBackoff.U
val lrsc_addr = Reg(UInt())
val s2_lr = s2_req.cmd === M_XLR && !s2_nack
val s2_sc = s2_req.cmd === M_XSC && !s2_nack
val s2_lrsc_addr_match = lrsc_valid && lrsc_addr === get_block_addr(s2_req.addr)
val s2_sc_fail = s2_sc && !s2_lrsc_addr_match
val s2_sc_resp = Mux(s2_sc_fail, 1.U, 0.U)
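// per the RISC-V A extension, sc writes 0 to rd on success and a non-zero
// code on failure; we report 1 for every failure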
// we have permission on this block
// but we cannot finish in this pass
// we need to go to miss queue to update meta and set dirty first
val s2_set_dirty = s2_tag_match && s2_has_permission && s2_hit_state =/= s2_new_hit_state
// this sc should succeed, but we need to set dirty first
// do not treat it as an sc failure or reset the lr/sc counter
val sc_set_dirty = s2_set_dirty && !s2_nack && s2_sc && s2_lrsc_addr_match
when (s2_valid && !sc_set_dirty) {
when (s2_hit && !s2_nack && s2_lr) {
lrsc_count := (lrscCycles - 1).U
lrsc_addr := get_block_addr(s2_req.addr)
} .otherwise {
lrsc_count := 0.U
}
} .elsewhen (lrsc_count > 0.U) {
lrsc_count := lrsc_count - 1.U
}
io.block_probe_addr.valid := lrsc_valid
io.block_probe_addr.bits := lrsc_addr
// when we release this block,
// we invalidate this reservation set
when (io.wb_invalidate_lrsc.valid) {
when (io.wb_invalidate_lrsc.bits === lrsc_addr) {
lrsc_count := 0.U
}
// when we release this block, there should be no matching lrsc inflight
assert (!(s2_valid && (s2_lr || s2_sc) && io.wb_invalidate_lrsc.bits === get_block_addr(s2_req.addr)))
}
when (s2_valid) {
when (s2_req.addr === debug_sc_fail_addr) {
when (s2_sc_fail) {
debug_sc_fail_cnt := debug_sc_fail_cnt + 1.U
} .elsewhen (s2_sc) {
debug_sc_fail_cnt := 0.U
}
} .otherwise {
when (s2_sc_fail) {
debug_sc_fail_addr := s2_req.addr
debug_sc_fail_cnt := 1.U
}
}
}
assert(debug_sc_fail_cnt < 100.U, "L1DCache failed too many SCs in a row")
// only dump these signals when they are actually valid
dump_pipeline_valids("AtomicsPipe s2", "s2_hit", s2_valid && s2_hit)
dump_pipeline_valids("AtomicsPipe s2", "s2_nack", s2_valid && s2_nack)
dump_pipeline_valids("AtomicsPipe s2", "s2_nack_hit", s2_valid && s2_nack_hit)
dump_pipeline_valids("AtomicsPipe s2", "s2_nack_no_mshr", s2_valid && s2_nack_no_mshr)
dump_pipeline_valids("AtomicsPipe s2", "s2_nack_data", s2_valid && s2_nack_data)
when (s2_valid) {
XSDebug("lrsc_count: %d lrsc_valid: %b lrsc_addr: %x\n",
lrsc_count, lrsc_valid, lrsc_addr)
XSDebug("s2_lr: %b s2_sc: %b s2_lrsc_addr_match: %b s2_sc_fail: %b s2_sc_resp: %x\n",
s2_lr, s2_sc, s2_lrsc_addr_match, s2_sc_fail, s2_sc_resp)
XSDebug("debug_sc_fail_addr: %x debug_sc_fail_cnt: %d\n",
debug_sc_fail_addr, debug_sc_fail_cnt)
}
// load data gen
val s2_data = Wire(Vec(nWays, UInt(encRowBits.W)))
val data_resp = io.data_resp
for (w <- 0 until nWays) {
s2_data(w) := data_resp(w)(get_row(s2_req.addr))
}
val s2_data_muxed = Mux1H(s2_tag_match_way, s2_data)
// the index of word in a row, in case rowBits != wordBits
val s2_word_idx = if (rowWords == 1) 0.U else s2_req.addr(log2Up(rowWords*wordBytes)-1, log2Up(wordBytes))
val s2_data_words = Wire(Vec(rowWords, UInt(encWordBits.W)))
for (w <- 0 until rowWords) {
s2_data_words(w) := s2_data_muxed(encWordBits * (w + 1) - 1, encWordBits * w)
}
val s2_data_word = s2_data_words(s2_word_idx)
val s2_decoded = cacheParams.dataCode.decode(s2_data_word)
val s2_data_word_decoded = s2_decoded.corrected
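// each data word is stored ECC-encoded; decode() yields the corrected word
// plus error flags, and the assertion below guarantees a non-nacked hit
// never returns uncorrectable data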
assert(!(s2_valid && s2_hit && !s2_nack && s2_decoded.uncorrectable))
// send miss req to miss queue
io.miss_req.valid := s2_valid && !s2_nack_hit && !s2_nack_data && !s2_hit
io.miss_req.bits.cmd := s2_req.cmd
io.miss_req.bits.addr := get_block_addr(s2_req.addr)
io.miss_req.bits.tag_match := s2_tag_match
io.miss_req.bits.way_en := s2_way_en
io.miss_req.bits.old_meta := s2_old_meta
io.miss_req.bits.client_id := s2_req.meta.id
val resp = Wire(ValidIO(new DCacheWordResp))
resp.valid := s2_valid
resp.bits.data := Mux(s2_sc, s2_sc_resp, s2_data_word)
resp.bits.meta := s2_req.meta
// reuse this field to pass lr/sc valid to commit
// NEMU uses this to check whether the lr/sc counter is still valid
resp.bits.meta.id := lrsc_valid
resp.bits.miss := !s2_hit || s2_nack
resp.bits.replay := resp.bits.miss && (!io.miss_req.fire() || s2_nack)
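// replay semantics: the req missed and either failed to enter the miss
// queue or was nacked, so the sender must retry it later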
io.lsu.resp.valid := resp.valid
io.lsu.resp.bits := resp.bits
assert(!(resp.valid && !io.lsu.resp.ready))
when (resp.valid) {
XSDebug(s"AtomicsPipe resp: data: %x id: %d replayed_req: %b miss: %b need_replay: %b\n",
resp.bits.data, resp.bits.meta.id, resp.bits.meta.replay, resp.bits.miss, resp.bits.replay)
}
// ---------------------------------------
// s3: do data write
// Store/amo hits
val amoalu = Module(new AMOALU(wordBits))
amoalu.io.mask := s2_req.mask
amoalu.io.cmd := s2_req.cmd
amoalu.io.lhs := s2_data_word_decoded
amoalu.io.rhs := s2_req.data
val s3_req = RegNext(s2_req)
val s3_valid = RegNext(s2_valid && s2_hit && isWrite(s2_req.cmd) && !s2_nack && !s2_sc_fail)
val s3_tag_match_way = RegNext(s2_tag_match_way)
val wdata_encoded = cacheParams.dataCode.encode(amoalu.io.out)
val s3_wdata = Reg(UInt())
s3_wdata := wdata_encoded
// write dcache if hit
// only need to write the specific row
val wmask = WireInit(VecInit((0 until blockRows) map (i => 0.U(rowWords.W))))
val wdata = WireInit(VecInit((0 until blockRows) map (i => Cat(
(0 until rowWords) map { w => s3_wdata }))))
wmask(get_row(s3_req.addr)) := ~0.U(rowWords.W)
val data_write = io.data_write.bits
io.data_write.valid := s3_valid
data_write.rmask := DontCare
data_write.way_en := s3_tag_match_way
data_write.addr := s3_req.addr
data_write.wmask := wmask
data_write.data := wdata
assert(!(io.data_write.valid && !io.data_write.ready))
dump_pipeline_reqs("AtomicsPipe s3", s3_valid, s3_req)
// -------
// wire out signals for synchronization
io.inflight_req_idxes(0).valid := io.lsu.req.valid
io.inflight_req_idxes(1).valid := s1_valid
io.inflight_req_idxes(2).valid := s2_valid
io.inflight_req_idxes(0).bits := get_idx(s0_req.addr)
io.inflight_req_idxes(1).bits := get_idx(s1_req.addr)
io.inflight_req_idxes(2).bits := get_idx(s2_req.addr)
io.inflight_req_block_addrs(0).valid := io.lsu.req.valid
io.inflight_req_block_addrs(1).valid := s1_valid
io.inflight_req_block_addrs(2).valid := s2_valid
io.inflight_req_block_addrs(0).bits := get_block_addr(s0_req.addr)
io.inflight_req_block_addrs(1).bits := get_block_addr(s1_req.addr)
io.inflight_req_block_addrs(2).bits := get_block_addr(s2_req.addr)
// -------
// Debug logging functions
def dump_pipeline_reqs(pipeline_stage_name: String, valid: Bool,
req: DCacheWordReq ) = {
when (valid) {
XSDebug(s"$pipeline_stage_name cmd: %x addr: %x data: %x mask: %x id: %d replay: %b\n",
req.cmd, req.addr, req.data, req.mask, req.meta.id, req.meta.replay)
}
}
def dump_pipeline_valids(pipeline_stage_name: String, signal_name: String, valid: Bool) = {
when (valid) {
XSDebug(s"$pipeline_stage_name $signal_name\n")
}
}
}
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.XSDebug
// wraps around AtomicsPipe
// when a request misses, it sends a miss req to missQueue and replays the request
class AtomicsMissQueue extends DCacheModule
{
val io = IO(new DCacheBundle {
val lsu = Flipped(new DCacheWordIO)
val replay = new DCacheWordIO
val miss_resp = Flipped(ValidIO(new MissResp))
val miss_finish = DecoupledIO(new MissFinish)
})
val s_invalid :: s_replay_req :: s_replay_resp :: s_resp :: s_miss_resp :: s_miss_finish :: Nil = Enum(6)
val state = RegInit(s_invalid)
val id = 0.U
val req = Reg(new DCacheWordReq)
val resp = Reg(new DCacheWordResp)
val req_block_addr = get_block_addr(req.addr)
val reg_miss_resp = Reg(new MissResp)
// assign default values to output signals
io.lsu.req.ready := state === s_invalid
io.lsu.resp.valid := false.B
io.lsu.resp.bits := DontCare
io.replay.req.valid := false.B
io.replay.req.bits := DontCare
io.replay.resp.ready := false.B
io.miss_finish.valid := false.B
io.miss_finish.bits := DontCare
when (state =/= s_invalid) {
XSDebug("state: %d\n", state)
}
// --------------------------------------------
// s_invalid: receive requests
when (state === s_invalid) {
when (io.lsu.req.fire()) {
assert(!io.lsu.req.bits.meta.replay)
req := io.lsu.req.bits
state := s_replay_req
}
}
// --------------------------------------------
// replay
when (state === s_replay_req) {
io.replay.req.valid := true.B
io.replay.req.bits := req
when (io.replay.req.fire()) {
state := s_replay_resp
}
}
when (state === s_replay_resp) {
io.replay.resp.ready := true.B
when (io.replay.resp.fire()) {
// req missed
when (io.replay.resp.bits.miss) {
// replayed reqs should not miss
assert(!req.meta.replay)
// the req missed and did not enter mshr
// so replay it until it hits or enters mshr
when (io.replay.resp.bits.replay) {
state := s_replay_req
} .otherwise {
// the req missed and enters mshr
// wait for miss response
state := s_miss_resp
}
} .otherwise {
// req hits, everything OK
resp := io.replay.resp.bits
when (!req.meta.replay) {
state := s_resp
} .otherwise {
// if it's a replayed request
// we need to tell mshr, we are done
state := s_miss_finish
}
}
}
}
when (state === s_miss_resp) {
when (io.miss_resp.fire()) {
reg_miss_resp := io.miss_resp.bits
// mark req as replayed req
req.meta.replay := true.B
state := s_replay_req
}
}
when (state === s_miss_finish) {
io.miss_finish.valid := true.B
io.miss_finish.bits.client_id := id
io.miss_finish.bits.entry_id := reg_miss_resp.entry_id
when (io.miss_finish.fire()) {
state := s_resp
}
}
// --------------------------------------------
when (state === s_resp) {
io.lsu.resp.valid := true.B
io.lsu.resp.bits := resp
when (io.lsu.resp.fire()) {
state := s_invalid
}
}
// debug output
when (io.lsu.req.fire()) {
XSDebug(s"io.lsu.req cmd: %x addr: %x data: %x mask: %x id: %d replayed_req: %b\n",
io.lsu.req.bits.cmd, io.lsu.req.bits.addr, io.lsu.req.bits.data, io.lsu.req.bits.mask, io.lsu.req.bits.meta.id, io.lsu.req.bits.meta.replay)
}
val replay = io.replay.req
when (replay.fire()) {
XSDebug(s"replay cmd: %x addr: %x data: %x mask: %x id: %d replayed_req: %b\n",
replay.bits.cmd, replay.bits.addr, replay.bits.data, replay.bits.mask, replay.bits.meta.id, replay.bits.meta.replay)
}
when (io.lsu.resp.fire()) {
XSDebug(s"io.lsu.resp: data: %x id: %d replayed_req: %b miss: %b need_replay: %b\n",
io.lsu.resp.bits.data, io.lsu.resp.bits.meta.id, io.lsu.resp.bits.meta.replay, io.lsu.resp.bits.miss, io.lsu.resp.bits.replay)
}
val miss_resp = io.miss_resp
XSDebug(miss_resp.fire(), "miss_resp client_id: %d entry_id: %d\n",
miss_resp.bits.client_id, miss_resp.bits.entry_id)
val miss_finish = io.miss_finish
XSDebug(miss_finish.fire(), "miss_finish client_id: %d entry_id: %d\n",
miss_finish.bits.client_id, miss_finish.bits.entry_id)
when (io.lsu.req.fire()) {
XSDebug(s"AtomicsMissEntryTransaction req 0\n")
}
when (io.lsu.resp.fire()) {
XSDebug(s"AtomicsMissEntryTransaction resp 0\n")
}
}
package xiangshan.cache
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp, TransferSizes}
import freechips.rocketchip.tilelink.{TLClientNode, TLClientParameters, TLMasterParameters, TLMasterPortParameters, TLArbiter}
// Meta data for dcache requests
// anything that should go with reqs and resps goes here
class DCacheMeta extends DCacheBundle {
val id = UInt(reqIdWidth.W)
val vaddr = UInt(VAddrBits.W) // maybe we should use VAddrBits?
val paddr = UInt(PAddrBits.W)
val uop = new MicroOp //FIXME: opt data width
val mmio = Bool()
val tlb_miss = Bool()
// dcache request id
// master uses id to correlate resps to reqs
// different masters can allocate and free ids independently
// as long as they do not share resp
val mask = UInt((DataBits/8).W)
val replay = Bool() // whether it's a replayed request?
}
// memory request in word granularity (load, mmio, lr/sc, atomics)
class DCacheWordReq extends DCacheBundle
{
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val data = UInt(DataBits.W)
val mask = UInt((DataBits/8).W)
val meta = new DCacheMeta
}
// memory request in cache-line granularity (store)
class DCacheLineReq extends DCacheBundle
{
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val data = UInt((cfg.blockBytes * 8).W)
val mask = UInt(cfg.blockBytes.W)
val meta = new DCacheMeta
}
class DCacheWordResp extends DCacheBundle
{
val data = UInt(DataBits.W)
val meta = new DCacheMeta
// cache req missed, send it to miss queue
val miss = Bool()
// cache req nacked, replay it later
val replay = Bool()
}
class DCacheLineResp extends DCacheBundle
{
val data = UInt((cfg.blockBytes * 8).W)
val meta = new DCacheMeta
// cache req missed, send it to miss queue
val miss = Bool()
// cache req nacked, replay it later
val replay = Bool()
}
class Refill extends DCacheBundle
{
val addr = UInt(PAddrBits.W)
val data = UInt((cfg.blockBytes * 8).W)
}
class DCacheWordIO extends DCacheBundle
{
val req = DecoupledIO(new DCacheWordReq)
val resp = Flipped(DecoupledIO(new DCacheWordResp))
}
// used by load unit
class DCacheLoadIO extends DCacheWordIO
{
// kill previous cycle's req
val s1_kill = Output(Bool())
// cycle 0: virtual address: req.addr
// cycle 1: physical address: s1_paddr
val s1_paddr = Output(UInt(PAddrBits.W))
}
class DCacheLineIO extends DCacheBundle
{
val req = DecoupledIO(new DCacheLineReq )
val resp = Flipped(DecoupledIO(new DCacheLineResp))
}
class DCacheToLsuIO extends DCacheBundle {
val load = Vec(LoadPipelineWidth, Flipped(new DCacheLoadIO)) // for speculative load
val lsq = ValidIO(new Refill) // refill to load queue, wake up load misses
val store = Flipped(new DCacheLineIO) // for sbuffer
val atomics = Flipped(new DCacheWordIO) // atomics reqs
}
class DCacheIO extends DCacheBundle {
val lsu = new DCacheToLsuIO
val prefetch = DecoupledIO(new MissReq)
}
class DCache()(implicit p: Parameters) extends LazyModule with HasDCacheParameters {
val clientParameters = TLMasterPortParameters.v1(
Seq(TLMasterParameters.v1(
name = "dcache",
sourceId = IdRange(0, cfg.nMissEntries+1),
supportsProbe = TransferSizes(cfg.blockBytes)
))
)
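// source id layout: 0 .. nMissEntries - 1 are MSHR ids, and the extra id
// nMissEntries is reserved for voluntary Releases (see the bus.d demux below)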
val clientNode = TLClientNode(Seq(clientParameters))
lazy val module = new DCacheImp(this)
}
class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParameters with HasXSLog {
val io = IO(new DCacheIO)
val (bus, edge) = outer.clientNode.out.head
require(bus.d.bits.data.getWidth == l1BusDataWidth, "DCache: tilelink width does not match")
//----------------------------------------
// core data structures
val dataArray = Module(new DuplicatedDataArray)
val metaArray = Module(new DuplicatedMetaArray)
/*
dataArray.dump()
metaArray.dump()
*/
//----------------------------------------
// core modules
val ldu = Seq.fill(LoadPipelineWidth) { Module(new LoadPipe) }
val stu = Module(new StorePipe)
val atomics = Module(new AtomicsPipe)
val storeMissQueue = Module(new StoreMissQueue)
val atomicsMissQueue = Module(new AtomicsMissQueue)
val missQueue = Module(new MissQueue(edge))
val wb = Module(new WritebackUnit(edge))
val prober = Module(new ProbeUnit(edge))
//----------------------------------------
// meta array
val MetaWritePortCount = 2
val MissQueueMetaWritePort = 0
val ProberMetaWritePort = 1
val metaWriteArb = Module(new Arbiter(new L1MetaWriteReq, MetaWritePortCount))
metaWriteArb.io.in(MissQueueMetaWritePort) <> missQueue.io.meta_write
metaWriteArb.io.in(ProberMetaWritePort) <> prober.io.meta_write
metaArray.io.write <> metaWriteArb.io.out
// To simplify port arbitration
// Prober, StorePipe, LoadPipe 0 and AtomicsPipe all share read port 0
// if contention gets severe, consider load balancing across the two ports
val MetaReadPortCount = 4
val ProberMetaReadPort = 0
val StorePipeMetaReadPort = 1
val LoadPipeMetaReadPort = 2
val AtomicsPipeMetaReadPort = 3
val metaReadArb = Module(new Arbiter(new L1MetaReadReq, MetaReadPortCount))
metaReadArb.io.in(ProberMetaReadPort) <> prober.io.meta_read
metaReadArb.io.in(StorePipeMetaReadPort) <> stu.io.meta_read
metaReadArb.io.in(LoadPipeMetaReadPort) <> ldu(0).io.meta_read
metaReadArb.io.in(AtomicsPipeMetaReadPort) <> atomics.io.meta_read
metaArray.io.read(0) <> metaReadArb.io.out
prober.io.meta_resp <> metaArray.io.resp(0)
stu.io.meta_resp <> metaArray.io.resp(0)
ldu(0).io.meta_resp <> metaArray.io.resp(0)
atomics.io.meta_resp <> metaArray.io.resp(0)
for (w <- 1 until LoadPipelineWidth) {
metaArray.io.read(w) <> ldu(w).io.meta_read
ldu(w).io.meta_resp <> metaArray.io.resp(w)
}
//----------------------------------------
// data array
val DataWritePortCount = 3
val StorePipeDataWritePort = 0
val AtomicsPipeDataWritePort = 1
val MissQueueDataWritePort = 2
val dataWriteArb = Module(new Arbiter(new L1DataWriteReq, DataWritePortCount))
dataWriteArb.io.in(StorePipeDataWritePort) <> stu.io.data_write
dataWriteArb.io.in(MissQueueDataWritePort) <> missQueue.io.data_write
dataWriteArb.io.in(AtomicsPipeDataWritePort) <> atomics.io.data_write
dataArray.io.write <> dataWriteArb.io.out
// To simplify port arbitration
// WritebackUnit, StorePipe, LoadPipe 0 and AtomicsPipe all share read port 0
val DataReadPortCount = 4
val WritebackDataReadPort = 0
val StorePipeDataReadPort = 1
val LoadPipeDataReadPort = 2
val AtomicsPipeDataReadPort = 3
val dataReadArb = Module(new Arbiter(new L1DataReadReq, DataReadPortCount))
dataReadArb.io.in(WritebackDataReadPort) <> wb.io.data_req
dataReadArb.io.in(StorePipeDataReadPort) <> stu.io.data_read
dataReadArb.io.in(LoadPipeDataReadPort) <> ldu(0).io.data_read
dataReadArb.io.in(AtomicsPipeDataReadPort) <> atomics.io.data_read
dataArray.io.read(0) <> dataReadArb.io.out
dataArray.io.resp(0) <> wb.io.data_resp
dataArray.io.resp(0) <> stu.io.data_resp
dataArray.io.resp(0) <> atomics.io.data_resp
dataArray.io.resp(0) <> ldu(0).io.data_resp
for (w <- 1 until LoadPipelineWidth) {
dataArray.io.read(w) <> ldu(w).io.data_read
dataArray.io.resp(w) <> ldu(w).io.data_resp
}
//----------------------------------------
// load pipe and load miss queue
// the s1 kill signal
// only lsu uses this, replay never kills
for (w <- 0 until LoadPipelineWidth) {
val load_w_nack = nack_load(io.lsu.load(w).req.bits.addr)
ldu(w).io.lsu.req <> io.lsu.load(w).req
ldu(w).io.lsu.s1_paddr <> io.lsu.load(w).s1_paddr
ldu(w).io.nack := load_w_nack
XSDebug(load_w_nack, s"LoadUnit $w nacked\n")
ldu(w).io.lsu.resp <> io.lsu.load(w).resp
ldu(w).io.lsu.s1_kill <> io.lsu.load(w).s1_kill
assert(!(io.lsu.load(w).req.fire() && io.lsu.load(w).req.bits.meta.replay), "LSU should not replay requests")
}
for (w <- 0 until LoadPipelineWidth) {
assert(!(io.lsu.load(w).req.fire() && io.lsu.load(w).req.bits.meta.mmio), "MMIO requests should not go to cache")
assert(!(io.lsu.load(w).req.fire() && io.lsu.load(w).req.bits.meta.tlb_miss), "TLB missed requests should not go to cache")
}
//----------------------------------------
// store pipe and store miss queue
storeMissQueue.io.lsu <> io.lsu.store
/*
assert(!(storeMissQueue.io.replay.req.fire() && !storeMissQueue.io.replay.req.bits.meta.replay),
"StoreMissQueue should replay requests")
*/
assert(!(io.lsu.store.req.fire() && io.lsu.store.req.bits.meta.replay),
"Sbuffer should not should replay requests")
assert(!(io.lsu.store.req.fire() && io.lsu.store.req.bits.meta.mmio),
"MMIO requests should not go to cache")
assert(!(io.lsu.store.req.fire() && io.lsu.store.req.bits.meta.tlb_miss),
"TLB missed requests should not go to cache")
val store_block = block_store(storeMissQueue.io.replay.req.bits.addr)
block_decoupled(storeMissQueue.io.replay.req, stu.io.lsu.req, store_block && !storeMissQueue.io.replay.req.bits.meta.replay)
storeMissQueue.io.replay.resp <> stu.io.lsu.resp
XSDebug(store_block, "StorePipe blocked\n")
//----------------------------------------
// atomics pipe
atomics.io.wb_invalidate_lrsc := wb.io.inflight_addr
atomicsMissQueue.io.lsu <> io.lsu.atomics
atomicsMissQueue.io.replay <> atomics.io.lsu
val atomics_block = block_atomics(atomicsMissQueue.io.replay.req.bits.addr)
block_decoupled(atomicsMissQueue.io.replay.req, atomics.io.lsu.req, atomics_block && !atomicsMissQueue.io.replay.req.bits.meta.replay)
XSDebug(atomics_block, "AtomicsPipe blocked\n")
// when atomics are in flight, there should be no load or store in flight
// so atomics and store should not show up at the same time
val atomics_inflight = VecInit(atomics.io.inflight_req_block_addrs map (entry => entry.valid)).reduce(_||_)
val store_inflight = VecInit(stu.io.inflight_req_block_addrs map (entry => entry.valid)).reduce(_||_)
assert(!(atomics_inflight && store_inflight))
// some other stuff
val atomicsReq = io.lsu.atomics.req
assert(!(atomicsReq.fire() && atomicsReq.bits.meta.replay),
"Atomics does not support request replay")
assert(!(atomicsReq.fire() && atomicsReq.bits.meta.mmio),
"MMIO requests should not go to cache")
assert(!(atomicsReq.fire() && atomicsReq.bits.meta.tlb_miss),
"TLB missed requests should not go to cache")
//----------------------------------------
// miss queue
require(LoadPipelineWidth == 2, "We hard code the number of load misses")
val loadMissQueueClientId_0 = 0.U(clientIdWidth.W)
val loadMissQueueClientId_1 = 1.U(clientIdWidth.W)
val storeMissQueueClientId = 2.U(clientIdWidth.W)
val atomicsMissQueueClientId = 3.U(clientIdWidth.W)
// Request
val missReqArb = Module(new Arbiter(new MissReq, nClientMissQueues))
val missReq = missQueue.io.req
val loadMissReq_0 = ldu(0).io.miss_req
val loadMissReq_1 = ldu(1).io.miss_req
val storeMissReq = stu.io.miss_req
val atomicsMissReq = atomics.io.miss_req
missReqArb.io.in(0) <> loadMissReq_0
missReqArb.io.in(0).bits.client_id := Cat(loadMissQueueClientId_0,
loadMissReq_0.bits.client_id(entryIdMSB, entryIdLSB))
missReqArb.io.in(1) <> loadMissReq_1
missReqArb.io.in(1).bits.client_id := Cat(loadMissQueueClientId_1,
loadMissReq_1.bits.client_id(entryIdMSB, entryIdLSB))
missReqArb.io.in(2).valid := storeMissReq.valid
storeMissReq.ready := missReqArb.io.in(2).ready
missReqArb.io.in(2).bits := storeMissReq.bits
missReqArb.io.in(2).bits.client_id := Cat(storeMissQueueClientId,
storeMissReq.bits.client_id(entryIdMSB, entryIdLSB))
missReqArb.io.in(3).valid := atomicsMissReq.valid
atomicsMissReq.ready := missReqArb.io.in(3).ready
missReqArb.io.in(3).bits := atomicsMissReq.bits
missReqArb.io.in(3).bits.client_id := Cat(atomicsMissQueueClientId,
atomicsMissReq.bits.client_id(entryIdMSB, entryIdLSB))
val miss_block = block_miss(missReqArb.io.out.bits.addr)
block_decoupled(missReqArb.io.out, missReq, miss_block)
XSDebug(miss_block, "MissQueue blocked\n")
// Response
// store and atomics wait for miss queue responses
val missResp = missQueue.io.resp
val storeMissResp = storeMissQueue.io.miss_resp
val atomicsMissResp = atomicsMissQueue.io.miss_resp
val clientId = missResp.bits.client_id(clientIdMSB, clientIdLSB)
val isStoreMissResp = clientId === storeMissQueueClientId
storeMissResp.valid := missResp.valid && isStoreMissResp
storeMissResp.bits := missResp.bits
storeMissResp.bits.client_id := missResp.bits.client_id(entryIdMSB, entryIdLSB)
val isAtomicsMissResp = clientId === atomicsMissQueueClientId
atomicsMissResp.valid := missResp.valid && isAtomicsMissResp
atomicsMissResp.bits := missResp.bits
atomicsMissResp.bits.client_id := missResp.bits.client_id(entryIdMSB, entryIdLSB)
// Finish
val missFinish = missQueue.io.finish
val storeMissFinish = storeMissQueue.io.miss_finish
val atomicsMissFinish = atomicsMissQueue.io.miss_finish
val missFinishArb = Module(new Arbiter(new MissFinish, 2))
missFinishArb.io.in(0).valid := storeMissFinish.valid
storeMissFinish.ready := missFinishArb.io.in(0).ready
missFinishArb.io.in(0).bits.entry_id := storeMissFinish.bits.entry_id
missFinishArb.io.in(0).bits.client_id := Cat(storeMissQueueClientId,
storeMissFinish.bits.client_id(entryIdMSB, entryIdLSB))
missFinishArb.io.in(1).valid := atomicsMissFinish.valid
atomicsMissFinish.ready := missFinishArb.io.in(1).ready
missFinishArb.io.in(1).bits.entry_id := atomicsMissFinish.bits.entry_id
missFinishArb.io.in(1).bits.client_id := Cat(atomicsMissQueueClientId,
atomicsMissFinish.bits.client_id(entryIdMSB, entryIdLSB))
missFinish <> missFinishArb.io.out
// refill to load queue
io.lsu.lsq <> missQueue.io.refill
// tilelink stuff
bus.a <> missQueue.io.mem_acquire
bus.e <> missQueue.io.mem_finish
when (bus.d.bits.source === cfg.nMissEntries.U) {
// This should be ReleaseAck
bus.d.ready := true.B
missQueue.io.mem_grant.valid := false.B
missQueue.io.mem_grant.bits := DontCare
} .otherwise {
// This should be GrantData
missQueue.io.mem_grant <> bus.d
}
// sync with prober
missQueue.io.probe_wb_req.valid := prober.io.wb_req.fire()
missQueue.io.probe_wb_req.bits := prober.io.wb_req.bits
missQueue.io.probe_active := prober.io.inflight_req_idx
//----------------------------------------
// prober
prober.io.req.valid := bus.b.valid && !block_probe(get_block_addr(bus.b.bits.address))
bus.b.ready := prober.io.req.ready && !block_probe(get_block_addr(bus.b.bits.address))
prober.io.req.bits := bus.b.bits
//----------------------------------------
// wb
// 0 goes to prober, 1 goes to missQueue evictions
val wbArb = Module(new Arbiter(new WritebackReq(edge.bundle.sourceBits), 2))
wbArb.io.in(0) <> prober.io.wb_req
wbArb.io.in(1) <> missQueue.io.wb_req
wb.io.req <> wbArb.io.out
missQueue.io.wb_resp := wb.io.resp
prober.io.wb_resp := wb.io.resp
wb.io.mem_grant := bus.d.fire() && bus.d.bits.source === cfg.nMissEntries.U
TLArbiter.lowestFromSeq(edge, bus.c, Seq(prober.io.rep, wb.io.release))
// dcache should only deal with DRAM addresses
when (bus.a.fire()) {
assert(bus.a.bits.address >= 0x80000000L.U)
}
when (bus.b.fire()) {
assert(bus.b.bits.address >= 0x80000000L.U)
}
when (bus.c.fire()) {
assert(bus.c.bits.address >= 0x80000000L.U)
}
io.prefetch.valid := missQueue.io.req.fire()
io.prefetch.bits := missQueue.io.req.bits
// synchronization stuff
def nack_load(addr: UInt) = {
val store_addr_matches = VecInit(stu.io.inflight_req_block_addrs map (entry => entry.valid && entry.bits === get_block_addr(addr)))
val store_addr_match = store_addr_matches.reduce(_||_)
val atomics_addr_matches = VecInit(atomics.io.inflight_req_block_addrs map (entry => entry.valid && entry.bits === get_block_addr(addr)))
val atomics_addr_match = atomics_addr_matches.reduce(_||_)
val prober_idx_match = prober.io.inflight_req_block_addr.valid && get_idx(prober.io.inflight_req_block_addr.bits) === get_idx(addr)
val miss_idx_matches = VecInit(missQueue.io.inflight_req_idxes map (entry => entry.valid && entry.bits === get_idx(addr)))
val miss_idx_match = miss_idx_matches.reduce(_||_)
store_addr_match || atomics_addr_match || prober_idx_match || miss_idx_match
}
def block_store(addr: UInt) = {
val prober_idx_match = prober.io.inflight_req_block_addr.valid && get_idx(prober.io.inflight_req_block_addr.bits) === get_idx(addr)
val miss_idx_matches = VecInit(missQueue.io.inflight_req_idxes map (entry => entry.valid && entry.bits === get_idx(addr)))
val miss_idx_match = miss_idx_matches.reduce(_||_)
prober_idx_match || miss_idx_match
}
def block_atomics(addr: UInt) = {
val prober_idx_match = prober.io.inflight_req_block_addr.valid && get_idx(prober.io.inflight_req_block_addr.bits) === get_idx(addr)
val miss_idx_matches = VecInit(missQueue.io.inflight_req_idxes map (entry => entry.valid && entry.bits === get_idx(addr)))
val miss_idx_match = miss_idx_matches.reduce(_||_)
prober_idx_match || miss_idx_match
}
def block_miss(addr: UInt) = {
val prober_idx_match = prober.io.inflight_req_idx.valid && prober.io.inflight_req_idx.bits === get_idx(addr)
val miss_idx_matches = VecInit(missQueue.io.inflight_req_idxes map (entry => entry.valid && entry.bits === get_idx(addr)))
val miss_idx_match = miss_idx_matches.reduce(_||_)
prober_idx_match || miss_idx_match
}
def block_probe(addr: UInt) = {
val store_idx_matches = VecInit(stu.io.inflight_req_block_addrs map (entry => entry.valid && get_idx(entry.bits) === get_idx(addr)))
val store_idx_match = store_idx_matches.reduce(_||_)
val atomics_idx_matches = VecInit(atomics.io.inflight_req_block_addrs map (entry => entry.valid && get_idx(entry.bits) === get_idx(addr)))
val atomics_idx_match = atomics_idx_matches.reduce(_||_)
val lrsc_addr_match = atomics.io.block_probe_addr.valid && atomics.io.block_probe_addr.bits === get_block_addr(addr)
val miss_idx_matches = VecInit(missQueue.io.block_probe_idxes map (entry => entry.valid && entry.bits === get_idx(addr)))
val miss_idx_match = miss_idx_matches.reduce(_||_)
// also block probes against a miss req being fired this cycle
val miss_req_idx_match = missReq.fire() && get_idx(missReq.bits.addr) === get_idx(addr)
store_idx_match || atomics_idx_match || lrsc_addr_match || miss_idx_match || miss_req_idx_match
}
def block_decoupled[T <: Data](source: DecoupledIO[T], sink: DecoupledIO[T], block_signal: Bool) = {
sink.valid := source.valid && !block_signal
source.ready := sink.ready && !block_signal
sink.bits := source.bits
}
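// block_decoupled gates both valid and ready, so neither side can observe a
// fire() while block_signal is high; bits still pass through, so the sink
// sees the request as soon as the block is lifted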
}
......@@ -140,10 +140,10 @@ class TlbEntry(superpage: Boolean = false) extends TlbBundle {
val insideLevel = level.getOrElse(0.U)
val a = tag(vpnnLen*3-1, vpnnLen*2) === vpn(vpnnLen*3-1, vpnnLen*2)
val b = tag(vpnnLen*2-1, vpnnLen*1) === vpn(vpnnLen*2-1, vpnnLen*1)
XSDebug(Mux(insideLevel.asBool, a&b, a), p"Hit superpage: hit:${Mux(insideLevel.asBool, a&b, a)} tag:${Hexadecimal(tag)} level:${insideLevel} data:${data} a:${a} b:${b} vpn:${Hexadecimal(vpn)}\n")("TlbEntrySuperpage")
XSDebug(Mux(insideLevel.asBool, a&b, a), p"Hit superpage: hit:${Mux(insideLevel.asBool, a&b, a)} tag:${Hexadecimal(tag)} level:${insideLevel} data:${data} a:${a} b:${b} vpn:${Hexadecimal(vpn)}\n")
Mux(insideLevel.asBool, a&b, a)
} else {
XSDebug(tag === vpn, p"Hit normalpage: hit:${tag === vpn} tag:${Hexadecimal(tag)} data:${data} vpn:${Hexadecimal(vpn)}\n")("TlbEntryNormalpage")
XSDebug(tag === vpn, p"Hit normalpage: hit:${tag === vpn} tag:${Hexadecimal(tag)} data:${data} vpn:${Hexadecimal(vpn)}\n")
tag === vpn
}
}
......
package xiangshan.cache
import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink._
import utils.{HasTLDump, XSDebug, BoolStopWatch}
import chisel3.ExcitingUtils._
class MissReq extends DCacheBundle
{
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val client_id = UInt(missQueueClientIdWidth.W)
val tag_match = Bool()
val way_en = Bits(nWays.W)
val old_meta = new L1Metadata
}
class MissResp extends DCacheBundle
{
val client_id = UInt(missQueueClientIdWidth.W)
val entry_id = UInt(missQueueEntryIdWidth.W)
}
class MissFinish extends DCacheBundle
{
val client_id = UInt(missQueueClientIdWidth.W)
val entry_id = UInt(missQueueEntryIdWidth.W)
}
// One miss entry deals with one missed block
class MissEntry(edge: TLEdgeOut) extends DCacheModule
{
val io = IO(new Bundle {
// MSHR ID
val id = Input(UInt())
// client requests
val req = Flipped(DecoupledIO(new MissReq))
val resp = DecoupledIO(new MissResp)
val finish = Flipped(DecoupledIO(new MissFinish))
// refill to load queue to wake up missed requests
val refill = ValidIO(new Refill)
// bus
val mem_acquire = DecoupledIO(new TLBundleA(edge.bundle))
val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
val mem_finish = DecoupledIO(new TLBundleE(edge.bundle))
// write back
val wb_req = DecoupledIO(new WritebackReq(edge.bundle.sourceBits))
val wb_resp = Input(Bool())
// write meta and data
val meta_write = DecoupledIO(new L1MetaWriteReq)
val data_write = DecoupledIO(new L1DataWriteReq)
// for synchronization
val block_idx = Output(Valid(UInt()))
val block_addr = Output(Valid(UInt()))
val block_probe_idx = Output(Valid(UInt()))
val block_probe_addr = Output(Valid(UInt()))
// watch prober's write back requests
val probe_wb_req = Flipped(ValidIO(new WritebackReq(edge.bundle.sourceBits)))
val probe_active = Flipped(ValidIO(UInt()))
})
// MSHR:
// 1. get req
// 2. refill when necessary
// 3. writeback when necessary
// 4. update meta data
// 5. send response back to client
// 6. wait for client's finish
// 7. done
val s_invalid :: s_refill_req :: s_refill_resp :: s_mem_finish :: s_wait_probe_exit :: s_wb_req :: s_wb_resp :: s_data_write_req :: s_meta_write_req :: s_send_resp :: s_client_finish :: Nil = Enum(11)
val state = RegInit(s_invalid)
val req_reg = Reg(new MissReq)
val req = Mux(io.req.fire(), io.req.bits, req_reg)
val req_idx = get_idx(req.addr)
val req_tag = get_tag(req.addr)
val req_block_addr = get_block_addr(req.addr)
// meta read results
val req_tag_match = req.tag_match
val req_old_meta = req.old_meta
val req_way_en = req.way_en
// what permission to release for the old block?
val (_, shrink_param, coh_on_clear) = req_old_meta.coh.onCacheControl(M_FLUSH)
// what permission to acquire for the new block?
val new_coh = RegInit(ClientMetadata.onReset)
val grow_param = new_coh.onAccess(req.cmd)._2
val coh_on_grant = new_coh.onGrant(req.cmd, io.mem_grant.bits.param)
val (_, _, refill_done, refill_address_inc) = edge.addr_inc(io.mem_grant)
val grantack = Reg(Valid(new TLBundleE(edge.bundle)))
val refill_ctr = Reg(UInt(log2Up(refillCycles).W))
val should_refill_data = Reg(Bool())
val needs_writeback = Reg(Bool())
// for read, we do not need to replay requests
// just refill data to the load queue, and then we can exit
// no need to walk through the send_resp and client_finish states
//
// for store and amo
// we send back a response when we have finished everything
// and inform clients to replay their requests
val no_replay = Reg(Bool())
// assign default values to output signals
io.req.ready := false.B
io.resp.valid := false.B
io.resp.bits := DontCare
io.finish.ready := false.B
io.refill.valid := false.B
io.refill.bits := DontCare
io.mem_acquire.valid := false.B
io.mem_acquire.bits := DontCare
io.mem_grant.ready := false.B
io.mem_finish.valid := false.B
io.mem_finish.bits := DontCare
io.wb_req.valid := false.B
io.wb_req.bits := DontCare
io.meta_write.valid := false.B
io.meta_write.bits := DontCare
io.data_write.valid := false.B
io.data_write.bits := DontCare
io.block_idx.valid := state =/= s_invalid
io.block_addr.valid := state =/= s_invalid
// break combinational loop
io.block_idx.bits := get_idx(req_reg.addr)
io.block_addr.bits := get_block_addr(req_reg.addr)
// to preserve forward progress, we allow probes while we are dealing with acquire/grant
io.block_probe_idx.valid := state =/= s_invalid && state =/= s_refill_req && state =/= s_refill_resp
io.block_probe_addr.valid := state =/= s_invalid && state =/= s_refill_req && state =/= s_refill_resp
io.block_probe_idx.bits := get_idx(req_reg.addr)
io.block_probe_addr.bits := get_block_addr(req_reg.addr)
when (state =/= s_invalid) {
XSDebug("entry: %d state: %d\n", io.id, state)
XSDebug("entry: %d block_idx_valid: %b block_idx: %x block_addr_valid: %b block_addr: %x\n",
io.id, io.block_idx.valid, io.block_idx.bits, io.block_addr.valid, io.block_addr.bits)
XSDebug("entry: %d block_probe_idx_valid: %b block_probe_idx: %x block_probe_addr_valid: %b block_probe_addr: %x\n",
io.id, io.block_probe_idx.valid, io.block_probe_idx.bits, io.block_probe_addr.valid, io.block_probe_addr.bits)
}
// --------------------------------------------
// s_invalid: receive requests
// decision making
def decide_next_state(): UInt = {
val new_state = WireInit(s_invalid)
val old_coh = req_old_meta.coh
val needs_wb = old_coh.onCacheControl(M_FLUSH)._1 // does the line we are evicting need to be written back
no_replay := req.cmd === M_XRD
when (req_tag_match) {
val (is_hit, _, coh_on_hit) = old_coh.onAccess(req.cmd)
when (is_hit) { // set dirty bit
// read should never go here
// we get here only when we need to set dirty bit
assert(isWrite(req.cmd))
// go update meta
new_coh := coh_on_hit
new_state := s_meta_write_req
} .otherwise { // upgrade permissions
new_coh := old_coh
new_state := s_refill_req
}
} .otherwise { // refill and writeback if necessary
new_coh := ClientMetadata.onReset
should_refill_data := true.B
needs_writeback := needs_wb
// refill first to decrease load miss penalty
new_state := s_refill_req
}
new_state
}
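// decision table sketch for decide_next_state:
// tag match, has permission -> s_meta_write_req (hit, just set the dirty bit)
// tag match, no permission  -> s_refill_req     (permission upgrade)
// no tag match              -> s_refill_req     (refill; write back the victim if dirty)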
when (state === s_invalid) {
io.req.ready := true.B
when (io.req.fire()) {
grantack.valid := false.B
refill_ctr := 0.U
should_refill_data := false.B
needs_writeback := false.B
no_replay := false.B
req_reg := io.req.bits
state := decide_next_state()
}
}
// --------------------------------------------
// refill
when (state === s_refill_req) {
io.mem_acquire.valid := true.B
// TODO: Use AcquirePerm if just doing permissions acquire
// TODO: review this
io.mem_acquire.bits := edge.AcquireBlock(
fromSource = io.id,
toAddress = (Cat(req_tag, req_idx) << blockOffBits).asUInt(),
lgSize = (log2Up(cfg.blockBytes)).U,
growPermissions = grow_param)._2
when (io.mem_acquire.fire()) {
state := s_refill_resp
}
}
// ecc-encoded data
val refill_data = Reg(Vec(blockRows, UInt(encRowBits.W)))
// raw data
val refill_data_raw = Reg(Vec(blockRows, UInt(rowBits.W)))
when (state === s_refill_resp) {
io.mem_grant.ready := true.B
when (edge.hasData(io.mem_grant.bits)) {
when (io.mem_grant.fire()) {
// for AcquireBlock BtoT, we clear should_refill_data
// and expect a response with no data (Grant, not GrantData)
// but the block inclusive cache responds with GrantData!
// so we temporarily removed this assertion
// we may consider using AcquirePerm BtoT for permission upgrade
// assert(should_refill_data)
refill_ctr := refill_ctr + 1.U
for (i <- 0 until beatRows) {
val row = io.mem_grant.bits.data(rowBits * (i + 1) - 1, rowBits * i)
refill_data((refill_ctr << log2Floor(beatRows)) + i.U) := Cat((0 until rowWords).reverse map { w =>
val word = row(wordBits * (w + 1) - 1, wordBits * w)
val word_encoded = cacheParams.dataCode.encode(word)
word_encoded
})
refill_data_raw((refill_ctr << log2Floor(beatRows)) + i.U) := row
}
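// row indexing sketch: beat b of the grant carries rows b * beatRows ..
// b * beatRows + beatRows - 1, so (refill_ctr << log2Floor(beatRows)) + i.U
// is the absolute row index within the refilled line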
when (refill_ctr === (refillCycles - 1).U) {
assert(refill_done, "refill not done!")
}
}
}
when (refill_done) {
grantack.valid := edge.isRequest(io.mem_grant.bits)
grantack.bits := edge.GrantAck(io.mem_grant.bits)
new_coh := coh_on_grant
state := s_mem_finish
}
}
// refill data to load queue
io.refill.valid := RegNext(state === s_refill_resp && refill_done &&
should_refill_data && no_replay)
io.refill.bits.addr := req_block_addr
io.refill.bits.data := refill_data_raw.asUInt
when (state === s_mem_finish) {
io.mem_finish.valid := grantack.valid
io.mem_finish.bits := grantack.bits
when (io.mem_finish.fire()) {
grantack.valid := false.B
state := s_wait_probe_exit
}
}
// --------------------------------------------
// sync with probe
when (state === s_wait_probe_exit) {
// we only wait for the prober when it is manipulating our set
val should_wait_for_probe_exit = io.probe_active.valid && io.probe_active.bits === req_idx
when (!should_wait_for_probe_exit) {
when (needs_writeback) {
// write back data
state := s_wb_req
} .otherwise {
// no need to write back
when (should_refill_data) {
// fill data into dcache
state := s_data_write_req
} .otherwise {
// permission update only
state := s_meta_write_req
}
}
}
}
// during refill, a probe may step in and release our blocks
// if it releases the block we are trying to acquire, we don't care, since we will get it back eventually
// but we need to know whether it releases the block we are trying to evict
val prober_writeback_our_block = (state === s_refill_req || state === s_refill_resp ||
state === s_mem_finish || state === s_wait_probe_exit) &&
io.probe_wb_req.valid && !io.probe_wb_req.bits.voluntary &&
io.probe_wb_req.bits.tag === req_old_meta.tag &&
io.probe_wb_req.bits.idx === req_idx &&
io.probe_wb_req.bits.way_en === req_way_en &&
needs_writeback
def onShrink(param: UInt): ClientMetadata = {
import freechips.rocketchip.tilelink.ClientStates._
import freechips.rocketchip.tilelink.TLPermissions._
val state = MuxLookup(param, Nothing, Seq(
TtoB -> Branch,
TtoN -> Nothing,
BtoN -> Nothing))
ClientMetadata(state)
}
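// e.g. a probe reporting TtoB leaves our victim line in Branch (shared,
// clean) while TtoN / BtoN drop it to Nothing, mirroring what the prober
// did to the line behind our back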
when (prober_writeback_our_block) {
req_reg.old_meta.coh := onShrink(io.probe_wb_req.bits.param)
}
// --------------------------------------------
// write back
when (state === s_wb_req) {
io.wb_req.valid := true.B
io.wb_req.bits.tag := req_old_meta.tag
io.wb_req.bits.idx := req_idx
io.wb_req.bits.param := shrink_param
io.wb_req.bits.way_en := req_way_en
io.wb_req.bits.source := io.id
io.wb_req.bits.voluntary := true.B
when (io.wb_req.fire()) {
state := s_wb_resp
}
}
when (state === s_wb_resp) {
when (io.wb_resp) {
state := s_data_write_req
}
}
// --------------------------------------------
// data write
when (state === s_data_write_req) {
io.data_write.valid := true.B
io.data_write.bits.addr := req_block_addr
io.data_write.bits.way_en := req_way_en
io.data_write.bits.wmask := VecInit((0 until blockRows) map (i => ~0.U(rowWords.W)))
io.data_write.bits.rmask := DontCare
io.data_write.bits.data := refill_data
when (io.data_write.fire()) {
state := s_meta_write_req
}
}
// --------------------------------------------
// meta write
when (state === s_meta_write_req) {
io.meta_write.valid := true.B
io.meta_write.bits.idx := req_idx
io.meta_write.bits.data.coh := new_coh
io.meta_write.bits.data.tag := req_tag
io.meta_write.bits.way_en := req_way_en
when (io.meta_write.fire()) {
when (no_replay) {
// no need to replay, exit now
state := s_invalid
} .otherwise {
state := s_send_resp
}
}
}
// --------------------------------------------
when (state === s_send_resp) {
io.resp.valid := true.B
io.resp.bits.client_id := req.client_id
io.resp.bits.entry_id := io.id
when (io.resp.fire()) {
// additional assertion
val (is_hit, _, coh_on_hit) = new_coh.onAccess(req.cmd)
assert(is_hit, "We still don't have permissions for this block")
assert(new_coh === coh_on_hit, "Incorrect coherence meta data")
state := s_client_finish
}
}
when (state === s_client_finish) {
io.finish.ready := true.B
when (io.finish.fire()) {
state := s_invalid
}
}
}
class MissQueue(edge: TLEdgeOut) extends DCacheModule with HasTLDump
{
val io = IO(new Bundle {
val req = Flipped(DecoupledIO(new MissReq))
val resp = ValidIO(new MissResp)
val finish = Flipped(DecoupledIO(new MissFinish))
val refill = ValidIO(new Refill)
val mem_acquire = Decoupled(new TLBundleA(edge.bundle))
val mem_grant = Flipped(Decoupled(new TLBundleD(edge.bundle)))
val mem_finish = Decoupled(new TLBundleE(edge.bundle))
val wb_req = Decoupled(new WritebackReq(edge.bundle.sourceBits))
val wb_resp = Input(Bool())
val meta_write = Decoupled(new L1MetaWriteReq)
val data_write = Decoupled(new L1DataWriteReq)
val probe_wb_req = Flipped(ValidIO(new WritebackReq(edge.bundle.sourceBits)))
val probe_active = Flipped(ValidIO(UInt()))
val inflight_req_idxes = Output(Vec(cfg.nMissEntries, Valid(UInt())))
val inflight_req_block_addrs = Output(Vec(cfg.nMissEntries, Valid(UInt())))
val block_probe_idxes = Output(Vec(cfg.nMissEntries, Valid(UInt())))
val block_probe_addrs = Output(Vec(cfg.nMissEntries, Valid(UInt())))
})
val resp_arb = Module(new Arbiter(new MissResp, cfg.nMissEntries))
val refill_arb = Module(new Arbiter(new Refill, cfg.nMissEntries))
val meta_write_arb = Module(new Arbiter(new L1MetaWriteReq, cfg.nMissEntries))
val data_write_arb = Module(new Arbiter(new L1DataWriteReq, cfg.nMissEntries))
val wb_req_arb = Module(new Arbiter(new WritebackReq(edge.bundle.sourceBits), cfg.nMissEntries))
// assign default values to output signals
io.finish.ready := false.B
io.mem_grant.ready := false.B
val entry_alloc_idx = Wire(UInt())
val req_ready = WireInit(false.B)
val entries = (0 until cfg.nMissEntries) map { i =>
val entry = Module(new MissEntry(edge))
entry.io.id := i.U(log2Up(cfg.nMissEntries).W)
// entry req
entry.io.req.valid := (i.U === entry_alloc_idx) && io.req.valid
entry.io.req.bits := io.req.bits
when (i.U === entry_alloc_idx) {
req_ready := entry.io.req.ready
}
// entry resp
resp_arb.io.in(i) <> entry.io.resp
refill_arb.io.in(i).valid := entry.io.refill.valid
refill_arb.io.in(i).bits := entry.io.refill.bits
// entry finish
entry.io.finish.valid := (i.U === io.finish.bits.entry_id) && io.finish.valid
entry.io.finish.bits := io.finish.bits
when (entry.io.finish.valid) {
io.finish.ready := entry.io.finish.ready
}
meta_write_arb.io.in(i) <> entry.io.meta_write
data_write_arb.io.in(i) <> entry.io.data_write
wb_req_arb.io.in(i) <> entry.io.wb_req
entry.io.wb_resp := io.wb_resp
entry.io.probe_wb_req <> io.probe_wb_req
entry.io.probe_active <> io.probe_active
entry.io.mem_grant.valid := false.B
entry.io.mem_grant.bits := DontCare
when (io.mem_grant.bits.source === i.U) {
entry.io.mem_grant <> io.mem_grant
}
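// only the entry whose id matches bits.source sees the grant: this is
// effectively a D-channel demux keyed on source id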
io.inflight_req_idxes(i) <> entry.io.block_idx
io.inflight_req_block_addrs(i) <> entry.io.block_addr
io.block_probe_idxes(i) <> entry.io.block_probe_idx
io.block_probe_addrs(i) <> entry.io.block_probe_addr
if (!env.FPGAPlatform) {
ExcitingUtils.addSource(
BoolStopWatch(
start = entry.io.req.fire(),
stop = entry.io.resp.fire(),
startHighPriority = true),
"perfCntDCacheMissQueuePenaltyEntry" + Integer.toString(i, 10),
Perf
)
}
entry
}
entry_alloc_idx := PriorityEncoder(entries.map(m=>m.io.req.ready))
io.req.ready := req_ready
io.resp.valid := resp_arb.io.out.valid
io.resp.bits := resp_arb.io.out.bits
resp_arb.io.out.ready := true.B
io.refill.valid := refill_arb.io.out.valid
io.refill.bits := refill_arb.io.out.bits
refill_arb.io.out.ready := true.B
// one refill at a time
val refill_vec = refill_arb.io.in.map(c => c.valid)
assert(PopCount(refill_vec) === 0.U || PopCount(refill_vec) === 1.U)
io.meta_write <> meta_write_arb.io.out
io.data_write <> data_write_arb.io.out
io.wb_req <> wb_req_arb.io.out
TLArbiter.lowestFromSeq(edge, io.mem_acquire, entries.map(_.io.mem_acquire))
TLArbiter.lowestFromSeq(edge, io.mem_finish, entries.map(_.io.mem_finish))
// print all input/output requests for debug purpose
// print req
val req = io.req
XSDebug(req.fire(), "req cmd: %x addr: %x client_id: %d\n",
req.bits.cmd, req.bits.addr, req.bits.client_id)
val resp = io.resp
XSDebug(resp.fire(), "resp client_id: %d entry_id: %d\n",
resp.bits.client_id, resp.bits.entry_id)
val finish = io.finish
XSDebug(finish.fire(), "finish client_id: %d entry_id: %d\n",
finish.bits.client_id, finish.bits.entry_id)
// print refill
XSDebug(io.refill.fire(), "refill addr %x\n", io.refill.bits.addr)
// print data_write
XSDebug(io.data_write.fire(), "data_write addr %x\n", io.data_write.bits.addr)
// print meta_write
XSDebug(io.meta_write.fire(), "meta_write idx %x way_en: %x old_tag: %x new_coh: %d new_tag: %x\n",
io.meta_write.bits.idx, io.meta_write.bits.way_en, io.meta_write.bits.tag,
io.meta_write.bits.data.coh.state, io.meta_write.bits.data.tag)
// print wb_req
XSDebug(io.wb_req.fire(), "wb_req idx %x tag: %x source: %d param: %x way_en: %x voluntary: %b\n",
io.wb_req.bits.idx, io.wb_req.bits.tag,
io.wb_req.bits.source, io.wb_req.bits.param,
io.wb_req.bits.way_en, io.wb_req.bits.voluntary)
// print tilelink messages
when (io.mem_acquire.fire()) {
XSDebug("mem_acquire ")
io.mem_acquire.bits.dump
}
when (io.mem_grant.fire()) {
XSDebug("mem_grant ")
io.mem_grant.bits.dump
}
when (io.mem_finish.fire()) {
XSDebug("mem_finish ")
io.mem_finish.bits.dump
}
if (!env.FPGAPlatform) {
ExcitingUtils.addSource(io.req.fire(), "perfCntDCacheMiss", Perf)
}
}
package xiangshan.cache
import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink._
import utils.{HasTLDump, XSDebug}
class ProbeUnit(edge: TLEdgeOut) extends DCacheModule with HasTLDump {
val io = IO(new Bundle {
val req = Flipped(Decoupled(new TLBundleB(edge.bundle)))
val rep = Decoupled(new TLBundleC(edge.bundle))
val meta_read = Decoupled(new L1MetaReadReq)
val meta_resp = Input(Vec(nWays, new L1Metadata))
val meta_write = Decoupled(new L1MetaWriteReq)
val wb_req = Decoupled(new WritebackReq(edge.bundle.sourceBits))
val wb_resp = Input(Bool())
val inflight_req_idx = Output(Valid(UInt()))
val inflight_req_block_addr = Output(Valid(UInt()))
})
val s_invalid :: s_meta_read_req :: s_meta_read_resp :: s_decide_next_state :: s_release :: s_wb_req :: s_wb_resp :: s_meta_write_req :: Nil = Enum(8)
val state = RegInit(s_invalid)
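// FSM sketch:
// s_invalid -> s_meta_read_req -> s_meta_read_resp -> s_decide_next_state
// dirty hit: s_wb_req -> s_wb_resp -> s_meta_write_req -> s_invalid
// clean hit: s_release -> s_meta_write_req -> s_invalid
// miss:      s_release -> s_invalid (report NtoN, no meta update)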
val req = Reg(new TLBundleB(edge.bundle))
val req_idx = get_idx(req.address)
val req_tag = get_tag(req.address)
val req_block_addr = get_block_addr(req.address)
val req_way_en = Reg(UInt())
val tag_matches = req_way_en.orR
val old_coh = Reg(new ClientMetadata)
val miss_coh = ClientMetadata.onReset
val reply_coh = Mux(tag_matches, old_coh, miss_coh)
val (is_dirty, report_param, new_coh) = reply_coh.onProbe(req.param)
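// if no way matches, reply with the reset metadata (Nothing): onProbe
// then yields a clean NtoN report, so we ProbeAck without data and
// never touch the meta array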
// assign default values to signals
io.req.ready := false.B
io.rep.valid := false.B
io.rep.bits := DontCare
io.meta_read.valid := false.B
io.meta_read.bits := DontCare
io.meta_write.valid := false.B
io.meta_write.bits := DontCare
io.wb_req.valid := false.B
io.wb_req.bits := DontCare
io.inflight_req_idx.valid := state =/= s_invalid
io.inflight_req_idx.bits := req_idx
io.inflight_req_block_addr.valid := state =/= s_invalid
io.inflight_req_block_addr.bits := req_block_addr
when (state =/= s_invalid) {
XSDebug("state: %d\n", state)
}
when (state === s_invalid) {
io.req.ready := true.B
when (io.req.fire()) {
req := io.req.bits
state := s_meta_read_req
}
}
when (state === s_meta_read_req) {
io.meta_read.valid := true.B
val meta_read = io.meta_read.bits
meta_read.idx := req_idx
meta_read.way_en := ~0.U(nWays.W)
meta_read.tag := DontCare
when (io.meta_read.fire()) {
state := s_meta_read_resp
}
}
when (state === s_meta_read_resp) {
// tag check
def wayMap[T <: Data](f: Int => T) = VecInit((0 until nWays).map(f))
val tag_eq_way = wayMap((w: Int) => io.meta_resp(w).tag === (req_tag)).asUInt
val tag_match_way = wayMap((w: Int) => tag_eq_way(w) && io.meta_resp(w).coh.isValid()).asUInt
val hit_state = Mux1H(tag_match_way, wayMap((w: Int) => io.meta_resp(w).coh))
old_coh := hit_state
req_way_en := tag_match_way
state := s_decide_next_state
}
when (state === s_decide_next_state) {
// decide next state
state := Mux(tag_matches && is_dirty, s_wb_req, s_release)
}
// no need to write back, just release
when (state === s_release) {
io.rep.valid := true.B
io.rep.bits := edge.ProbeAck(req, report_param)
when (io.rep.fire()) {
state := Mux(tag_matches, s_meta_write_req, s_invalid)
}
}
when (state === s_wb_req) {
io.wb_req.valid := true.B
io.wb_req.bits.tag := req_tag
io.wb_req.bits.idx := req_idx
io.wb_req.bits.param := report_param
io.wb_req.bits.way_en := req_way_en
io.wb_req.bits.source := req.source
io.wb_req.bits.voluntary := false.B
when (io.wb_req.fire()) {
state := s_wb_resp
}
}
when (state === s_wb_resp) {
when (io.wb_resp) {
state := s_meta_write_req
}
}
when (state === s_meta_write_req) {
io.meta_write.valid := true.B
io.meta_write.bits.idx := req_idx
io.meta_write.bits.data.coh := new_coh
io.meta_write.bits.data.tag := req_tag
io.meta_write.bits.way_en := req_way_en
when (io.meta_write.fire()) {
state := s_invalid
}
}
// print wb_req
XSDebug(io.wb_req.fire(), "wb_req idx %x tag: %x source: %d param: %x way_en: %x voluntary: %b\n",
io.wb_req.bits.idx, io.wb_req.bits.tag,
io.wb_req.bits.source, io.wb_req.bits.param,
io.wb_req.bits.way_en, io.wb_req.bits.voluntary)
// print tilelink messages
when (io.req.fire()) {
XSDebug("mem_probe ")
io.req.bits.dump
}
when (io.rep.fire()) {
XSDebug("mem_release ")
io.rep.bits.dump
}
}
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.XSDebug
import bus.tilelink._
class StoreMissEntry extends DCacheModule
{
val io = IO(new Bundle {
val id = Input(UInt())
val lsu = Flipped(new DCacheLineIO)
val replay = new DCacheLineIO
val miss_resp = Flipped(ValidIO(new MissResp))
val miss_finish = DecoupledIO(new MissFinish)
val idx = Output(Valid(UInt()))
val tag = Output(Valid(UInt()))
})
val s_invalid :: s_replay_req :: s_replay_resp :: s_resp :: s_miss_resp :: s_miss_finish :: Nil = Enum(6)
val state = RegInit(s_invalid)
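// FSM sketch:
// s_invalid -> s_replay_req -> s_replay_resp
//   hit:                 -> s_resp (fresh req) / s_miss_finish (replayed req)
//   miss, mshr refused:  -> s_replay_req (retry until accepted)
//   miss, mshr accepted: -> s_miss_resp -> s_replay_req (marked as replay)
// s_miss_finish -> s_resp -> s_invalid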
val req = Reg(new DCacheLineReq )
val resp = Reg(new DCacheLineResp)
val req_idx = get_idx(req.addr)
val req_tag = get_tag(req.addr)
val req_block_addr = get_block_addr(req.addr)
val reg_miss_resp = Reg(new MissResp)
// assign default values to output signals
io.lsu.req.ready := state === s_invalid
io.lsu.resp.valid := false.B
io.lsu.resp.bits := DontCare
io.replay.req.valid := false.B
io.replay.req.bits := DontCare
io.replay.resp.ready := false.B
io.miss_finish.valid := false.B
io.miss_finish.bits := DontCare
io.idx.valid := state =/= s_invalid
io.tag.valid := state =/= s_invalid
io.idx.bits := req_idx
io.tag.bits := req_tag
when (state =/= s_invalid) {
XSDebug("entry: %d state: %d idx: %x tag: %x\n", io.id, state, io.idx.bits, io.tag.bits)
}
// --------------------------------------------
// s_invalid: receive requests
when (state === s_invalid) {
when (io.lsu.req.fire()) {
assert(io.lsu.req.bits.cmd === M_XWR)
assert(!io.lsu.req.bits.meta.replay)
req := io.lsu.req.bits
state := s_replay_req
}
}
// --------------------------------------------
// replay
when (state === s_replay_req) {
io.replay.req.valid := true.B
io.replay.req.bits := req
// use our own storeMissEntryId
// miss resp are routed by this id
io.replay.req.bits.meta.id := io.id
when (io.replay.req.fire()) {
state := s_replay_resp
}
}
when (state === s_replay_resp) {
io.replay.resp.ready := true.B
when (io.replay.resp.fire()) {
// req missed
when (io.replay.resp.bits.miss) {
// replayed reqs should not miss
assert(!req.meta.replay)
// the req missed and did not enter mshr
// so replay it until it hits or enters mshr
when (io.replay.resp.bits.replay) {
state := s_replay_req
} .otherwise {
// the req missed and enters mshr
// wait for miss response
state := s_miss_resp
}
} .otherwise {
// req hits, everything OK
resp := io.replay.resp.bits
when (!req.meta.replay) {
state := s_resp
} .otherwise {
// if it's a replayed request
// we need to tell mshr, we are done
state := s_miss_finish
}
}
}
}
when (state === s_miss_resp) {
when (io.miss_resp.fire()) {
reg_miss_resp := io.miss_resp.bits
// mark req as replayed req
req.meta.replay := true.B
state := s_replay_req
}
}
when (state === s_miss_finish) {
io.miss_finish.valid := true.B
io.miss_finish.bits.client_id := io.id
io.miss_finish.bits.entry_id := reg_miss_resp.entry_id
when (io.miss_finish.fire()) {
state := s_resp
}
}
// --------------------------------------------
when (state === s_resp) {
io.lsu.resp.valid := true.B
io.lsu.resp.bits := resp
// response to sbuffer should carry the original request id
io.lsu.resp.bits.meta.id := req.meta.id
when (io.lsu.resp.fire()) {
state := s_invalid
}
}
// debug output
when (io.lsu.req.fire()) {
XSDebug(s"StoreMissEntryTransaction req %d\n", io.id)
}
when (io.lsu.resp.fire()) {
XSDebug(s"StoreMissEntryTransaction resp %d\n", io.id)
}
}
class StoreMissQueue extends DCacheModule
{
val io = IO(new Bundle {
val lsu = Flipped(new DCacheLineIO)
val replay = new DCacheLineIO
val miss_resp = Flipped(ValidIO(new MissResp))
val miss_finish = DecoupledIO(new MissFinish)
})
val miss_finish_arb = Module(new Arbiter(new MissFinish, cfg.nStoreMissEntries))
val replay_arb = Module(new Arbiter(new DCacheLineReq, cfg.nStoreMissEntries))
val resp_arb = Module(new Arbiter(new DCacheLineResp, cfg.nStoreMissEntries))
val idx_matches = Wire(Vec(cfg.nStoreMissEntries, Bool()))
val tag_matches = Wire(Vec(cfg.nStoreMissEntries, Bool()))
val tag_match = Mux1H(idx_matches, tag_matches)
val idx_match = idx_matches.reduce(_||_)
when (io.lsu.req.valid) {
XSDebug("idx_match: %b tag_match: %b\n", idx_match, tag_match)
}
val req = io.lsu.req
val entry_alloc_idx = Wire(UInt())
val pri_rdy = WireInit(false.B)
val pri_val = req.valid && !idx_match
// sbuffer should not send down the same block twice;
// moreover, it should still allow writes into the sbuffer
// while the same block is being handled by the dcache
// assert(!(req.valid && tag_match))
io.replay.resp.ready := false.B
val entry_id_MSB = reqIdWidth - 1
val entry_id_LSB = reqIdWidth - storeMissQueueEntryIdWidth
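// illustrative helpers (not part of the original code) for the id scheme:
// replay requests carry Cat(entry_id, original_id) in meta.id, so replay
// responses can be demuxed back to their entry and the original id
// restored before responding to the sbuffer
def packReplayId(entry_id: UInt, orig_id: UInt): UInt =
Cat(entry_id, orig_id(entry_id_LSB - 1, 0))
def unpackEntryId(id: UInt): UInt = id(entry_id_MSB, entry_id_LSB)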
val entries = (0 until cfg.nStoreMissEntries) map { i =>
val entry = Module(new StoreMissEntry)
entry.io.id := i.U(storeMissQueueEntryIdWidth.W)
idx_matches(i) := entry.io.idx.valid && entry.io.idx.bits === get_idx(req.bits.addr)
tag_matches(i) := entry.io.tag.valid && entry.io.tag.bits === get_tag(req.bits.addr)
// lsu req and resp
val entry_lsu = entry.io.lsu
entry_lsu.req.valid := (i.U === entry_alloc_idx) && pri_val
when (i.U === entry_alloc_idx) {
pri_rdy := entry_lsu.req.ready
}
entry_lsu.req.bits := req.bits
resp_arb.io.in(i) <> entry_lsu.resp
// replay req and resp
val entry_replay = entry.io.replay
replay_arb.io.in(i) <> entry_replay.req
replay_arb.io.in(i).bits.meta.id <> Cat(entry.io.id,
entry_replay.req.bits.meta.id(entry_id_LSB - 1, 0))
val resp_entry_id = io.replay.resp.bits.meta.id(entry_id_MSB, entry_id_LSB)
entry_replay.resp.valid := (i.U === resp_entry_id) && io.replay.resp.valid
entry_replay.resp.bits := io.replay.resp.bits
entry_replay.resp.bits.meta.id := Cat(0.U(storeMissQueueEntryIdWidth.W),
io.replay.resp.bits.meta.id(entry_id_LSB - 1, 0))
when (entry_replay.resp.valid) {
io.replay.resp.ready := entry_replay.resp.ready
}
entry.io.miss_resp.valid := (i.U === io.miss_resp.bits.client_id) && io.miss_resp.valid
entry.io.miss_resp.bits := io.miss_resp.bits
miss_finish_arb.io.in(i) <> entry.io.miss_finish
entry
}
entry_alloc_idx := PriorityEncoder(entries.map(m=>m.io.lsu.req.ready))
// whenever index matches, do not let it in
req.ready := pri_rdy && !idx_match
io.lsu.resp <> resp_arb.io.out
io.replay.req <> replay_arb.io.out
io.miss_finish <> miss_finish_arb.io.out
// debug output
when (req.fire()) {
XSDebug(s"req cmd: %x addr: %x data: %x mask: %x id: %d replay: %b\n",
req.bits.cmd, req.bits.addr, req.bits.data, req.bits.mask, req.bits.meta.id, req.bits.meta.replay)
}
val replay = io.replay.req
when (replay.fire()) {
XSDebug(s"replay cmd: %x addr: %x data: %x mask: %x id: %d replay: %b\n",
replay.bits.cmd, replay.bits.addr, replay.bits.data, replay.bits.mask, replay.bits.meta.id, replay.bits.meta.replay)
}
val resp = io.lsu.resp
when (resp.fire()) {
XSDebug(s"resp: data: %x id: %d replay: %b miss: %b replay: %b\n",
resp.bits.data, resp.bits.meta.id, resp.bits.meta.replay, resp.bits.miss, resp.bits.replay)
}
val miss_resp = io.miss_resp
XSDebug(miss_resp.fire(), "miss_resp client_id: %d entry_id: %d\n",
miss_resp.bits.client_id, miss_resp.bits.entry_id)
val miss_finish = io.miss_finish
XSDebug(miss_finish.fire(), "miss_finish client_id: %d entry_id: %d\n",
miss_finish.bits.client_id, miss_finish.bits.entry_id)
}
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.{XSDebug}
class StorePipe extends DCacheModule
{
val io = IO(new DCacheBundle{
val lsu = Flipped(new DCacheLineIO)
val data_read = DecoupledIO(new L1DataReadReq)
val data_resp = Input(Vec(nWays, Vec(blockRows, Bits(encRowBits.W))))
val data_write = DecoupledIO(new L1DataWriteReq)
val meta_read = DecoupledIO(new L1MetaReadReq)
val meta_resp = Input(Vec(nWays, new L1Metadata))
val inflight_req_idxes = Output(Vec(3, Valid(UInt())))
val inflight_req_block_addrs = Output(Vec(3, Valid(UInt())))
// send miss request to miss queue
val miss_req = DecoupledIO(new MissReq)
})
// LSU requests
io.lsu.req.ready := io.meta_read.ready && io.data_read.ready
io.meta_read.valid := io.lsu.req.valid
io.data_read.valid := io.lsu.req.valid
val meta_read = io.meta_read.bits
val data_read = io.data_read.bits
// Tag read for new requests
meta_read.idx := get_idx(io.lsu.req.bits.addr)
meta_read.way_en := ~0.U(nWays.W)
meta_read.tag := DontCare
// Data read for new requests
data_read.addr := io.lsu.req.bits.addr
data_read.way_en := ~0.U(nWays.W)
data_read.rmask := ~0.U(blockRows.W)
// Pipeline
// stage 0
val s0_valid = io.lsu.req.fire()
val s0_req = io.lsu.req.bits
assert(!(s0_valid && s0_req.cmd =/= MemoryOpConstants.M_XWR), "StorePipe only accepts store req")
dump_pipeline_reqs("StorePipe s0", s0_valid, s0_req)
// stage 1
val s1_req = RegNext(s0_req)
val s1_valid = RegNext(s0_valid, init = false.B)
val s1_addr = s1_req.addr
val s1_nack = false.B
dump_pipeline_reqs("StorePipe s1", s1_valid, s1_req)
val meta_resp = io.meta_resp
// tag check
def wayMap[T <: Data](f: Int => T) = VecInit((0 until nWays).map(f))
val s1_tag_eq_way = wayMap((w: Int) => meta_resp(w).tag === (get_tag(s1_addr))).asUInt
val s1_tag_match_way = wayMap((w: Int) => s1_tag_eq_way(w) && meta_resp(w).coh.isValid()).asUInt
val s1_tag_match = s1_tag_match_way.orR
val s1_hit_meta = Mux1H(s1_tag_match_way, wayMap((w: Int) => meta_resp(w)))
val s1_hit_state = s1_hit_meta.coh
// replacement policy
val replacer = cacheParams.replacement
val s1_repl_way_en = UIntToOH(replacer.way)
val s1_repl_meta = Mux1H(s1_repl_way_en, wayMap((w: Int) => meta_resp(w)))
when (io.miss_req.fire()) {
replacer.miss
}
// stage 2
val s2_req = RegNext(s1_req)
val s2_valid = RegNext(s1_valid, init = false.B)
dump_pipeline_reqs("StorePipe s2", s2_valid, s2_req)
val s2_tag_match_way = RegNext(s1_tag_match_way)
val s2_tag_match = RegNext(s1_tag_match)
val s2_hit_meta = RegNext(s1_hit_meta)
val s2_hit_state = RegNext(s1_hit_state)
val s2_has_permission = s2_hit_state.onAccess(s2_req.cmd)._1
val s2_new_hit_state = s2_hit_state.onAccess(s2_req.cmd)._3
val s2_repl_meta = RegNext(s1_repl_meta)
val s2_repl_way_en = RegNext(s1_repl_way_en)
val s2_old_meta = Mux(s2_tag_match, s2_hit_meta, s2_repl_meta)
val s2_way_en = Mux(s2_tag_match, s2_tag_match_way, s2_repl_way_en)
// we not only need the right permissions,
// we also require that the coherence state does not change on a hit,
// i.e. new_hit_state === old_hit_state
//
// If the state changes on a hit,
// we treat it as a miss and let the mshr deal with it,
// since we cannot write meta data in this pipeline.
// It's possible to have permission while the state still changes on a hit:
// eg: a write to an exclusive but clean block
val s2_hit = s2_tag_match && s2_has_permission && s2_hit_state === s2_new_hit_state
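// worked example (assuming rocket-chip ClientStates): a store hitting a
// Trunk (exclusive but clean) line has permission, but onAccess(M_XWR)
// promotes it to Dirty, so s2_new_hit_state =/= s2_hit_state and the
// request takes the miss path; only a store hitting an already-Dirty
// line counts as s2_hit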
val s2_nack = Wire(Bool())
// when req got nacked, upper levels should replay this request
// the same set is busy
val s2_nack_hit = RegNext(s1_nack)
// cannot allocate an mshr for this store miss
val s2_nack_no_mshr = io.miss_req.valid && !io.miss_req.ready
// Bank conflict on data arrays
// For now, we use DuplicatedDataArray, so no bank conflicts
val s2_nack_data = false.B
s2_nack := s2_nack_hit || s2_nack_no_mshr || s2_nack_data
val s2_info = p"tag match: $s2_tag_match hasPerm: $s2_has_permission" +
p" hit state: $s2_hit_state new state: $s2_new_hit_state s2_nack: $s2_nack\n"
// deal with data
val data_resp = io.data_resp
val s2_data = Mux1H(s2_tag_match_way, data_resp)
val s2_data_decoded = (0 until blockRows) map { r =>
(0 until rowWords) map { w =>
val data = s2_data(r)(encWordBits * (w + 1) - 1, encWordBits * w)
val decoded = cacheParams.dataCode.decode(data)
assert(!(s2_valid && s2_hit && !s2_nack && decoded.uncorrectable))
decoded.corrected
}
}
val wdata_merged = Wire(Vec(blockRows, UInt(encRowBits.W)))
def mergePutData(old_data: UInt, new_data: UInt, wmask: UInt): UInt = {
val full_wmask = FillInterleaved(8, wmask)
((~full_wmask & old_data) | (full_wmask & new_data))
}
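// e.g. old_data = 0x1122334455667788, new_data = 0xAAAAAAAAAAAAAAAA,
// wmask = 0x0F: full_wmask = 0x00000000FFFFFFFF, result =
// 0x11223344AAAAAAAA -- only the masked-in low four bytes are replaced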
// now, we do not deal with ECC
for (i <- 0 until blockRows) {
wdata_merged(i) := Cat((0 until rowWords).reverse map { w =>
val old_data = s2_data_decoded(i)(w)
val new_data = s2_req.data(rowBits * (i + 1) - 1, rowBits * i)(wordBits * (w + 1) - 1, wordBits * w)
val wmask = s2_req.mask(rowBytes * (i + 1) - 1, rowBytes * i)(wordBytes * (w + 1) - 1, wordBytes * w)
val wdata = mergePutData(old_data, new_data, wmask)
val wdata_encoded = cacheParams.dataCode.encode(wdata)
wdata_encoded
})
}
// write dcache if hit
val data_write = io.data_write.bits
io.data_write.valid := s2_valid && s2_hit
data_write.rmask := DontCare
data_write.way_en := s2_tag_match_way
data_write.addr := s2_req.addr
data_write.wmask := VecInit((0 until blockRows) map (i => ~0.U(rowWords.W)))
data_write.data := wdata_merged
assert(!(io.data_write.valid && !io.data_write.ready))
// only dump these signals when they are actually valid
dump_pipeline_valids("StorePipe s2", "s2_hit", s2_valid && s2_hit)
dump_pipeline_valids("StorePipe s2", "s2_nack", s2_valid && s2_nack)
dump_pipeline_valids("StorePipe s2", "s2_nack_hit", s2_valid && s2_nack_hit)
dump_pipeline_valids("StorePipe s2", "s2_nack_no_mshr", s2_valid && s2_nack_no_mshr)
dump_pipeline_valids("StorePipe s2", "s2_nack_data", s2_valid && s2_nack_data)
// send store miss to miss queue
io.miss_req.valid := s2_valid && !s2_nack_hit && !s2_nack_data && !s2_hit
io.miss_req.bits.cmd := s2_req.cmd
io.miss_req.bits.addr := get_block_addr(s2_req.addr)
io.miss_req.bits.tag_match := s2_tag_match
io.miss_req.bits.way_en := s2_way_en
io.miss_req.bits.old_meta := s2_old_meta
io.miss_req.bits.client_id := s2_req.meta.id
val resp = Wire(Valid(new DCacheLineResp))
resp.valid := s2_valid
resp.bits.data := DontCare
resp.bits.meta := s2_req.meta
resp.bits.miss := !s2_hit || s2_nack
resp.bits.replay := resp.bits.miss && (!io.miss_req.fire() || s2_nack)
io.lsu.resp.valid := resp.valid
io.lsu.resp.bits := resp.bits
assert(!(resp.valid && !io.lsu.resp.ready))
when (resp.valid) {
XSDebug(s"StorePipe resp: data: %x id: %d replayed_req: %b miss: %b need_replay: %b\n",
resp.bits.data, resp.bits.meta.id, resp.bits.meta.replay, resp.bits.miss, resp.bits.replay)
}
io.inflight_req_idxes(0).valid := io.lsu.req.valid
io.inflight_req_idxes(1).valid := s1_valid
io.inflight_req_idxes(2).valid := s2_valid
io.inflight_req_idxes(0).bits := get_idx(s0_req.addr)
io.inflight_req_idxes(1).bits := get_idx(s1_req.addr)
io.inflight_req_idxes(2).bits := get_idx(s2_req.addr)
io.inflight_req_block_addrs(0).valid := io.lsu.req.valid
io.inflight_req_block_addrs(1).valid := s1_valid
io.inflight_req_block_addrs(2).valid := s2_valid
io.inflight_req_block_addrs(0).bits := get_block_addr(s0_req.addr)
io.inflight_req_block_addrs(1).bits := get_block_addr(s1_req.addr)
io.inflight_req_block_addrs(2).bits := get_block_addr(s2_req.addr)
// -------
// Debug logging functions
def dump_pipeline_reqs(pipeline_stage_name: String, valid: Bool, req: DCacheLineReq ) = {
when (valid) {
XSDebug(
s"$pipeline_stage_name cmd: %x addr: %x id: %d replay: %b\n",
req.cmd, req.addr, req.meta.id, req.meta.replay
)
}
}
def dump_pipeline_valids(pipeline_stage_name: String, signal_name: String, valid: Bool) = {
when (valid) {
XSDebug(p"$pipeline_stage_name $signal_name " + s2_info)
}
}
}
......@@ -106,10 +106,10 @@ class MMIOEntry(edge: TLEdgeOut) extends DCacheModule
// --------------------------------------------
when (state === s_send_resp) {
io.resp.valid := true.B
io.resp.bits.data := resp_data
// meta data should go with the response
io.resp.bits.meta := req.meta
io.resp.bits.miss := false.B
io.resp.bits.id := req.id
io.resp.bits.replay := false.B
when (io.resp.fire()) {
......
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.XSDebug
import freechips.rocketchip.tilelink.{TLBundleC, TLEdgeOut, TLPermissions}
class WritebackReq(sourceBits: Int) extends DCacheBundle {
val tag = Bits(tagBits.W)
val idx = Bits(idxBits.W)
val source = UInt(sourceBits.W)
val param = UInt(TLPermissions.cWidth.W)
val way_en = Bits(nWays.W)
val voluntary = Bool()
override def cloneType: WritebackReq.this.type = new WritebackReq(sourceBits).asInstanceOf[this.type]
}
class WritebackUnit(edge: TLEdgeOut) extends DCacheModule {
val io = IO(new Bundle {
val req = Flipped(DecoupledIO(new WritebackReq(edge.bundle.sourceBits)))
val resp = Output(Bool())
val data_req = DecoupledIO(new L1DataReadReq)
val data_resp = Input(Vec(nWays, Vec(blockRows, Bits(encRowBits.W))))
val release = DecoupledIO(new TLBundleC(edge.bundle))
val mem_grant = Input(Bool())
val inflight_addr = Output(Valid(UInt()))
})
val req = Reg(new WritebackReq(edge.bundle.sourceBits))
val s_invalid :: s_data_read_req :: s_data_read_resp :: s_active :: s_grant :: s_resp :: Nil = Enum(6)
val state = RegInit(s_invalid)
val should_writeback_data = Reg(Bool())
val data_req_cnt = RegInit(0.U(log2Up(refillCycles+1).W))
val (_, last_beat, all_beats_done, beat_count) = edge.count(io.release)
val wb_buffer = Reg(Vec(refillCycles, UInt(beatBits.W)))
val acked = RegInit(false.B)
// assign default value to signals
io.req.ready := false.B
io.resp := false.B
io.data_req.valid := false.B
io.data_req.bits := DontCare
io.release.valid := false.B
io.release.bits := DontCare
io.inflight_addr.valid := state =/= s_invalid
io.inflight_addr.bits := req.idx << blockOffBits
when (state =/= s_invalid) {
XSDebug("state: %d\n", state)
}
when (state === s_invalid) {
io.req.ready := true.B
when (io.req.fire()) {
// for report types: TtoT, BtoB, NtoN, we do nothing
import freechips.rocketchip.tilelink.TLPermissions._
def is_dirty(x: UInt) = x <= TtoN
def do_nothing(x: UInt) = x > BtoN
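// TLPermissions C-channel encodings (as defined in rocket-chip):
// shrinks TtoB = 0, TtoN = 1 leave T state and may own dirty data;
// BtoN = 2 shrinks from Branch, never dirty, released without data;
// reports TtoT = 3, BtoB = 4, NtoN = 5 keep permissions, nothing to do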
when (do_nothing(io.req.bits.param)) {
should_writeback_data := false.B
state := s_resp
} .otherwise {
when (is_dirty(io.req.bits.param)) {
state := s_data_read_req
should_writeback_data := true.B
} .otherwise {
state := s_active
should_writeback_data := false.B
}
data_req_cnt := 0.U
req := io.req.bits
acked := false.B
}
}
}
val dataArrayLatency = 2
val data_array_ctr = Reg(UInt(log2Up(dataArrayLatency).W))
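// the data array is assumed to answer a fixed dataArrayLatency cycles
// after the read fires; the counter below just waits that long before
// sampling io.data_resp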
when (state === s_data_read_req) {
// Data read for new requests
io.data_req.valid := true.B
io.data_req.bits.addr := req.idx << blockOffBits
io.data_req.bits.way_en := req.way_en
io.data_req.bits.rmask := ~0.U(blockRows.W)
when (io.data_req.fire()) {
state := s_data_read_resp
data_array_ctr := 0.U
}
}
when (state === s_data_read_resp) {
data_array_ctr := data_array_ctr + 1.U
when (data_array_ctr === (dataArrayLatency - 1).U) {
val way_idx = OHToUInt(req.way_en)
for (i <- 0 until refillCycles) {
wb_buffer(i) := Cat((0 until beatRows).reverse map { j =>
val idx = i * beatRows + j
val row = io.data_resp(way_idx)(idx)
// encode each word in this row
val row_decoded = Cat((0 until rowWords).reverse map { w =>
val data_word = row(encWordBits * (w + 1) - 1, encWordBits * w)
val decoded = cacheParams.dataCode.decode(data_word)
val data_word_decoded = decoded.corrected
assert(!decoded.uncorrectable)
data_word_decoded
})
row_decoded
})
}
state := s_active
}
}
// release
val r_address = (Cat(req.tag, req.idx) << blockOffBits).asUInt()
val id = cfg.nMissEntries
val probeResponse = edge.ProbeAck(
fromSource = req.source,
toAddress = r_address,
lgSize = log2Ceil(cfg.blockBytes).U,
reportPermissions = req.param
)
val probeResponseData = edge.ProbeAck(
fromSource = req.source,
toAddress = r_address,
lgSize = log2Ceil(cfg.blockBytes).U,
reportPermissions = req.param,
data = wb_buffer(data_req_cnt)
)
val voluntaryRelease = edge.Release(
fromSource = id.U,
toAddress = r_address,
lgSize = log2Ceil(cfg.blockBytes).U,
shrinkPermissions = req.param
)._2
val voluntaryReleaseData = edge.Release(
fromSource = id.U,
toAddress = r_address,
lgSize = log2Ceil(cfg.blockBytes).U,
shrinkPermissions = req.param,
data = wb_buffer(data_req_cnt)
)._2
when (state === s_active) {
io.release.valid := data_req_cnt < refillCycles.U
io.release.bits := Mux(req.voluntary,
Mux(should_writeback_data, voluntaryReleaseData, voluntaryRelease),
Mux(should_writeback_data, probeResponseData, probeResponse))
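// message selection as a truth table (dirty ~ should_writeback_data):
// voluntary  && dirty  -> ReleaseData  (replacement writeback)
// voluntary  && !dirty -> Release      (clean eviction, permissions only)
// !voluntary && dirty  -> ProbeAckData (probe hit a dirty line)
// !voluntary && !dirty -> ProbeAck     (probe, no data owed)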
when (io.mem_grant) {
acked := true.B
}
when (io.release.fire()) {
data_req_cnt := data_req_cnt + 1.U
val release_done = Mux(should_writeback_data, data_req_cnt === (refillCycles-1).U, true.B)
when (release_done) {
state := Mux(req.voluntary, s_grant, s_resp)
}
}
}
when (state === s_grant) {
when (io.mem_grant) {
acked := true.B
}
when (acked) {
state := s_resp
}
}
when (state === s_resp) {
io.resp := true.B
state := s_invalid
}
// print all input/output requests for debug purpose
// print req
val io_req = io.req.bits
XSDebug(io.req.fire(), "req tag: %x idx: %x source: %d param: %x way_en: %x voluntary: %b\n",
io_req.tag, io_req.idx, io_req.source, io_req.param, io_req.way_en, io_req.voluntary)
// print data req
val io_data_req = io.data_req.bits
XSDebug(io.data_req.fire(), "data_req addr: %x way_en: %x\n", io_data_req.addr, io_data_req.way_en)
// print release
// XSDebug.exec(io.release.fire(), io.release.bits.dump)
// print mem_grant
XSDebug(io.mem_grant, "mem_grant\n")
}
......@@ -557,14 +557,7 @@ class LoadQueue extends XSModule
io.uncache.req.bits.data := dataModule.io.uncache.rdata.data
io.uncache.req.bits.mask := dataModule.io.uncache.rdata.mask
io.uncache.req.bits.meta.id := DontCare
io.uncache.req.bits.meta.vaddr := DontCare
io.uncache.req.bits.meta.paddr := dataModule.io.uncache.rdata.paddr
io.uncache.req.bits.meta.uop := uop(deqPtr)
io.uncache.req.bits.meta.mmio := true.B
io.uncache.req.bits.meta.tlb_miss := false.B
io.uncache.req.bits.meta.mask := dataModule.io.uncache.rdata.mask
io.uncache.req.bits.meta.replay := false.B
io.uncache.req.bits.id := DontCare
io.uncache.resp.ready := true.B
......
......@@ -215,14 +215,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
io.uncache.req.bits.data := dataModule.io.rdata(0).data
io.uncache.req.bits.mask := dataModule.io.rdata(0).mask
io.uncache.req.bits.meta.id := DontCare
io.uncache.req.bits.meta.vaddr := DontCare
io.uncache.req.bits.meta.paddr := dataModule.io.rdata(0).paddr
io.uncache.req.bits.meta.uop := uop(deqPtr)
io.uncache.req.bits.meta.mmio := true.B
io.uncache.req.bits.meta.tlb_miss := false.B
io.uncache.req.bits.meta.mask := dataModule.io.rdata(0).mask
io.uncache.req.bits.meta.replay := false.B
io.uncache.req.bits.id := DontCare
when(io.uncache.req.fire()){
pending(deqPtr) := false.B
......@@ -282,11 +275,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
io.sbuffer(i).bits.addr := dataModuleRead(i).paddr
io.sbuffer(i).bits.data := dataModuleRead(i).data
io.sbuffer(i).bits.mask := dataModuleRead(i).mask
io.sbuffer(i).bits.meta := DontCare
io.sbuffer(i).bits.meta.tlb_miss := false.B
io.sbuffer(i).bits.meta.uop := DontCare
io.sbuffer(i).bits.meta.mmio := false.B
io.sbuffer(i).bits.meta.mask := dataModuleRead(i).mask
io.sbuffer(i).bits.id := DontCare
when (io.sbuffer(i).fire()) {
allocated(ptr) := false.B
......
......@@ -157,10 +157,7 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{
io.dcache.req.bits.data := genWdata(in.src2, in.uop.ctrl.fuOpType(1,0))
// TODO: atomics do need mask: fix mask
io.dcache.req.bits.mask := genWmask(paddr, in.uop.ctrl.fuOpType(1,0))
io.dcache.req.bits.meta.id := DontCare
io.dcache.req.bits.meta.paddr := paddr
io.dcache.req.bits.meta.tlb_miss := false.B
io.dcache.req.bits.meta.replay := false.B
io.dcache.req.bits.id := DontCare
when(io.dcache.req.fire()){
state := s_cache_resp
......@@ -170,7 +167,7 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{
when (state === s_cache_resp) {
io.dcache.resp.ready := true.B
when(io.dcache.resp.fire()) {
is_lrsc_valid := io.dcache.resp.bits.meta.id
is_lrsc_valid := io.dcache.resp.bits.id
val rdata = io.dcache.resp.bits.data
val rdataSel = LookupTree(paddr(2, 0), List(
"b000".U -> rdata(63, 0),
......@@ -232,4 +229,4 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{
when(io.redirect.valid){
atom_override_xtval := false.B
}
}
\ No newline at end of file
}
......@@ -44,14 +44,7 @@ class LoadUnit_S0 extends XSModule {
io.dcacheReq.bits.data := DontCare
// TODO: update cache meta
io.dcacheReq.bits.meta.id := DontCare
io.dcacheReq.bits.meta.vaddr := s0_vaddr
io.dcacheReq.bits.meta.paddr := DontCare
io.dcacheReq.bits.meta.uop := s0_uop
io.dcacheReq.bits.meta.mmio := false.B
io.dcacheReq.bits.meta.tlb_miss := false.B
io.dcacheReq.bits.meta.mask := s0_mask
io.dcacheReq.bits.meta.replay := false.B
io.dcacheReq.bits.id := DontCare
val addrAligned = LookupTree(s0_uop.ctrl.fuOpType(1, 0), List(
"b00".U -> true.B, //b
......
......@@ -57,7 +57,7 @@ class FakeSbuffer extends XSModule {
dcache_req.bits.addr := block_addr(req.addr)
dcache_req.bits.data := wdataVec.asUInt
dcache_req.bits.mask := wmaskVec.asUInt
dcache_req.bits.meta := DontCare
dcache_req.bits.id := DontCare
when (dcache_req.fire()) {
state := s_resp
......
......@@ -355,9 +355,8 @@ class NewSbuffer extends XSModule with HasSbufferCst {
io.dcache.req.bits.addr := getAddr(tagRead(prepareIdx))
io.dcache.req.bits.data := bufferRead(prepareIdx).data
io.dcache.req.bits.mask := bufferRead(prepareIdx).mask
io.dcache.req.bits.cmd := MemoryOpConstants.M_XWR
io.dcache.req.bits.meta := DontCare
io.dcache.req.bits.meta.id := prepareIdx
io.dcache.req.bits.id := prepareIdx
when(io.dcache.req.fire()){
stateVec(prepareIdx) := s_inflight
}
......@@ -368,7 +367,7 @@ class NewSbuffer extends XSModule with HasSbufferCst {
)
io.dcache.resp.ready := true.B // sbuffer always ready to recv dcache resp
val respId = io.dcache.resp.bits.meta.id
val respId = io.dcache.resp.bits.id
when(io.dcache.resp.fire()){
stateVec(respId) := s_invalid
assert(stateVec(respId) === s_inflight)
......
......@@ -368,7 +368,7 @@ class Sbuffer extends XSModule with HasSBufferConst {
io.dcache.req.bits.data := dcacheData
io.dcache.req.bits.mask := dcacheMask
io.dcache.req.bits.cmd := MemoryOpConstants.M_XWR
io.dcache.req.bits.meta := DontCare // NOT USED
io.dcache.req.bits.id := DontCare // NOT USED
io.dcache.resp.ready := false.B
wb_arb.io.out.ready := false.B
......
......@@ -349,14 +349,7 @@ class StoreQueue(nEntries: Int) extends Queue(nEntries, "StoreQueue") {
req.bits.addr.poke(r.addr.U)
req.bits.data.poke(r.data.U)
req.bits.mask.poke(FULL_MASK)
req.bits.meta.id.poke(tId.U)
req.bits.meta.vaddr.poke(r.addr.U)
req.bits.meta.paddr.poke(r.addr.U)
// req.bits.meta.uop.poke(0.U.asTypeOf(new MicroOp))
req.bits.meta.mmio.poke(false.B)
req.bits.meta.tlb_miss.poke(false.B)
req.bits.meta.mask.poke(FULL_MASK)
req.bits.meta.replay.poke(false.B)
req.bits.id.poke(tId.U)
}
if (req.valid.peek().litToBoolean && req.ready.peek().litToBoolean) {
......@@ -369,7 +362,7 @@ class StoreQueue(nEntries: Int) extends Queue(nEntries, "StoreQueue") {
// always ready
resp.ready.poke(true.B)
if (resp.valid.peek().litToBoolean) {
val id = resp.bits.meta.id.peek().litValue.longValue.toInt
val id = resp.bits.id.peek().litValue.longValue.toInt
idPool.free(id)
retire(id)
}
......
......@@ -162,14 +162,7 @@ class L2NonInclusiveGetTest extends AnyFlatSpec with ChiselScalatestTester with
req.bits.addr.poke(addr.U)
req.bits.data.poke(0.U)
req.bits.mask.poke(FULL_MASK_64)
req.bits.meta.id.poke(0.U)
req.bits.meta.vaddr.poke(addr.U)
req.bits.meta.paddr.poke(addr.U)
// req.bits.meta.uop.poke(0.U.asTypeOf(new MicroOp))
req.bits.meta.mmio.poke(true.B)
req.bits.meta.tlb_miss.poke(false.B)
req.bits.meta.mask.poke(FULL_MASK_64)
req.bits.meta.replay.poke(false.B)
req.bits.id.poke(0.U)
while (!req.ready.peek().litToBoolean) {
c.clock.step()
......@@ -204,14 +197,7 @@ class L2NonInclusiveGetTest extends AnyFlatSpec with ChiselScalatestTester with
req.bits.addr.poke(addr.U)
req.bits.data.poke(data.U)
req.bits.mask.poke(FULL_MASK_64)
req.bits.meta.id.poke(0.U)
req.bits.meta.vaddr.poke(addr.U)
req.bits.meta.paddr.poke(addr.U)
// req.bits.meta.uop.poke(0.U.asTypeOf(new MicroOp))
req.bits.meta.mmio.poke(true.B)
req.bits.meta.tlb_miss.poke(false.B)
req.bits.meta.mask.poke(FULL_MASK_64)
req.bits.meta.replay.poke(false.B)
req.bits.id.poke(0.U)
while (!req.ready.peek().litToBoolean) {
c.clock.step()
......
......@@ -136,21 +136,21 @@ class L2TestTop()(implicit p: Parameters) extends LazyModule{
def sendStoreReq(addr: UInt, data: UInt): DCacheLineReq = {
val req = Wire(new DCacheLineReq)
req.cmd := MemoryOpConstants.M_XWR
req.addr := addr
req.data := data
req.mask := Fill(req.mask.getWidth, true.B)
req.meta := DontCare
req.id := DontCare
req
}
def sendLoadReq(addr: UInt): DCacheWordReq = {
val req = Wire(new DCacheWordReq)
req.cmd := MemoryOpConstants.M_XA_ADD
req.addr := addr
req.data := 0.U
req.mask := Fill(req.mask.getWidth, true.B)
req.meta := DontCare
req.id := DontCare
req
}
......
......@@ -257,18 +257,11 @@ class UnalignedGetTestTop()(implicit p: Parameters) extends LazyModule{
def sendFlushReq(addr: UInt): DCacheWordReq = {
val req = Wire(new DCacheWordReq)
req.cmd := MemoryOpConstants.M_XWR
req.addr := FLUSH64_ADDR.U
req.data := addr
req.mask := FULL_MASK_64
req.meta.id := 0.U
req.meta.vaddr := FLUSH64_ADDR.U
req.meta.paddr := FLUSH64_ADDR.U
req.meta.uop := DontCare
req.meta.mmio := true.B
req.meta.tlb_miss := false.B
req.meta.mask := FULL_MASK_64
req.meta.replay := false.B
req.id := 0.U
req
}
......@@ -278,7 +271,7 @@ class UnalignedGetTestTop()(implicit p: Parameters) extends LazyModule{
req.addr := addr
req.data := data
req.mask := Fill(req.mask.getWidth, true.B)
req.meta := DontCare
req.id := DontCare
req
}
......
......@@ -30,7 +30,7 @@ class SbufferWapper extends XSModule {
// fake dcache
sbuffer.io.dcache.req.ready := true.B
sbuffer.io.dcache.resp.valid := RegNext(RegNext(RegNext(RegNext(sbuffer.io.dcache.req.valid))))
sbuffer.io.dcache.resp.bits.meta.id := RegNext(RegNext(RegNext(RegNext(sbuffer.io.dcache.req.bits.meta.id))))
sbuffer.io.dcache.resp.bits.id := RegNext(RegNext(RegNext(RegNext(sbuffer.io.dcache.req.bits.id))))
}
class SbufferTest extends AnyFlatSpec
......