提交 f773310b 编写于 作者: L lixin

Merge remote-tracking branch 'origin/master' into missqueue_enq_opt

......@@ -13,12 +13,13 @@ jobs:
continue-on-error: false
name: Generate Verilog
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v2
with:
submodules: 'recursive'
- name: set env
run: |
export HEAD_SHA=${{ github.run_number }}
echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "NEMU_HOME=/nfs/home/share/ci-workloads/NEMU" >> $GITHUB_ENV
echo "WAVE_HOME=/nfs/home/ci-runner/xs-wave/${HEAD_SHA}" >> $GITHUB_ENV
mkdir -p /nfs/home/ci-runner/xs-wave/${HEAD_SHA}
......@@ -48,12 +49,13 @@ jobs:
timeout-minutes: 900
name: EMU - Basics
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v2
with:
submodules: 'recursive'
- name: set env
run: |
export HEAD_SHA=${{ github.run_number }}
echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "NEMU_HOME=/nfs/home/share/ci-workloads/NEMU" >> $GITHUB_ENV
echo "AM_HOME=/nfs/home/share/ci-workloads/nexus-am" >> $GITHUB_ENV
echo "PERF_HOME=/nfs/home/ci-runner/xs-perf/${HEAD_SHA}" >> $GITHUB_ENV
......@@ -104,12 +106,13 @@ jobs:
timeout-minutes: 900
name: EMU - Performance
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v2
with:
submodules: 'recursive'
- name: set env
run: |
export HEAD_SHA=${{ github.run_number }}
echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "NEMU_HOME=/nfs/home/share/ci-workloads/NEMU" >> $GITHUB_ENV
echo "AM_HOME=/nfs/home/share/ci-workloads/nexus-am" >> $GITHUB_ENV
echo "PERF_HOME=/nfs/home/ci-runner/xs-perf/${HEAD_SHA}" >> $GITHUB_ENV
......@@ -166,12 +169,13 @@ jobs:
timeout-minutes: 900
name: EMU - MC
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v2
with:
submodules: 'recursive'
- name: set env
run: |
export HEAD_SHA=${{ github.run_number }}
echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "NEMU_HOME=/nfs/home/share/ci-workloads/NEMU" >> $GITHUB_ENV
echo "AM_HOME=/nfs/home/share/ci-workloads/nexus-am" >> $GITHUB_ENV
echo "PERF_HOME=/nfs/home/ci-runner/xs-perf/${HEAD_SHA}" >> $GITHUB_ENV
......@@ -199,12 +203,13 @@ jobs:
# timeout-minutes: 900
# name: SIMV - Basics
# steps:
# - uses: actions/checkout@v3
# - uses: actions/checkout@v2
# with:
# submodules: 'recursive'
# - name: set env
# run: |
# export HEAD_SHA=${{ github.run_number }}
# echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV
# echo "NEMU_HOME=/nfs/home/share/ci-workloads/NEMU" >> $GITHUB_ENV
# echo "AM_HOME=/nfs/home/share/ci-workloads/nexus-am" >> $GITHUB_ENV
# echo "PERF_HOME=/nfs/home/ci-runner/xs-perf/${HEAD_SHA}" >> $GITHUB_ENV
......
......@@ -13,12 +13,13 @@ jobs:
# Build + 8 checkpoints * 1-hour timeout
name: Nightly Regression - Checkpoints
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v2
with:
submodules: 'recursive'
- name: set env
run: |
export HEAD_SHA=${{ github.run_number }}
echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "NEMU_HOME=/nfs/home/share/ci-workloads/NEMU" >> $GITHUB_ENV
echo "PERF_HOME=/nfs/home/ci-runner/xs-perf/${HEAD_SHA}" >> $GITHUB_ENV
echo "WAVE_HOME=/nfs/home/ci-runner/xs-wave/${HEAD_SHA}" >> $GITHUB_ENV
......
......@@ -16,3 +16,6 @@
[submodule "utility"]
path = utility
url = https://github.com/OpenXiangShan/utility
[submodule "coupledL2"]
path = coupledL2
url = https://github.com/OpenXiangShan/coupledL2
......@@ -21,6 +21,7 @@ TOP_V = $(BUILD_DIR)/$(TOP).v
SCALA_FILE = $(shell find ./src/main/scala -name '*.scala')
TEST_FILE = $(shell find ./src/test/scala -name '*.scala')
MEM_GEN = ./scripts/vlsi_mem_gen
MEM_GEN_SEP = ./scripts/gen_sep_mem.sh
SIMTOP = top.SimTop
IMAGE ?= temp
......@@ -34,8 +35,8 @@ SIM_MEM_ARGS = --infer-rw --repl-seq-mem -c:$(SIMTOP):-o:$(@D)/$(@F).conf --gen-
# select firrtl compiler
ifeq ($(MFC),1)
override FC_ARGS = --mfc
override FPGA_MEM_ARGS = --infer-rw
override SIM_MEM_ARGS = --infer-rw
override FPGA_MEM_ARGS = --infer-rw --firtool-opt -split-verilog --firtool-opt -o --firtool-opt build --firtool-opt -repl-seq-mem --firtool-opt -repl-seq-mem-circuit=$(FPGATOP) --firtool-opt -repl-seq-mem-file=XSTop.v.conf
override SIM_MEM_ARGS = --infer-rw --firtool-opt -split-verilog --firtool-opt -o --firtool-opt build --firtool-opt -repl-seq-mem --firtool-opt -repl-seq-mem-circuit=$(SIMTOP) --firtool-opt -repl-seq-mem-file=SimTop.v.conf
endif
......@@ -47,9 +48,11 @@ endif
override SIM_ARGS += --with-dramsim3
endif
# top-down
ifeq ($(ENABLE_TOPDOWN),1)
override SIM_ARGS += --enable-topdown
# dynamic switch CONSTANTIN
ifeq ($(WITH_CONSTANTIN),0)
$(info disable WITH_CONSTANTIN)
else
override SIM_ARGS += --with-constantin
endif
# emu for the release version
......@@ -66,10 +69,6 @@ TIME_CMD = time -a -o $(TIMELOG)
SED_CMD = sed -i -e 's/_\(aw\|ar\|w\|r\|b\)_\(\|bits_\)/_\1/g'
# add comments to 'firrtl_black_box_resource_files'
AWK_CMD = gawk -i inplace 'BEGIN{f=0} /FILE "firrtl_black_box_resource_files.f"/{f=1} !f{print $$0} f{print "//", $$0}'
.DEFAULT_GOAL = verilog
help:
......@@ -82,10 +81,12 @@ $(TOP_V): $(SCALA_FILE)
$(FPGA_MEM_ARGS) \
--num-cores $(NUM_CORES) \
$(RELEASE_ARGS) $(FC_ARGS)
$(SED_CMD) $@
ifeq ($(MFC),1)
$(AWK_CMD) $@
for file in $(BUILD_DIR)/*.sv; do $(SED_CMD) "$${file}"; mv "$${file}" "$${file%.sv}.v"; done
mv $(BUILD_DIR)/$(BUILD_DIR)/* $(BUILD_DIR)
$(MEM_GEN_SEP) "$(MEM_GEN)" "$(TOP_V).conf" "$(BUILD_DIR)"
endif
$(SED_CMD) $@
@git log -n 1 >> .__head__
@git diff >> .__diff__
@sed -i 's/^/\/\// ' .__head__
......@@ -107,10 +108,12 @@ $(SIM_TOP_V): $(SCALA_FILE) $(TEST_FILE)
$(SIM_MEM_ARGS) \
--num-cores $(NUM_CORES) \
$(SIM_ARGS) $(FC_ARGS)
$(SED_CMD) $@
ifeq ($(MFC),1)
$(AWK_CMD) $@
for file in $(BUILD_DIR)/*.sv; do $(SED_CMD) "$${file}"; mv "$${file}" "$${file%.sv}.v"; done
mv $(BUILD_DIR)/$(BUILD_DIR)/* $(BUILD_DIR)
$(MEM_GEN_SEP) "$(MEM_GEN)" "$(SIM_TOP_V).conf" "$(BUILD_DIR)"
endif
$(SED_CMD) $@
@git log -n 1 >> .__head__
@git diff >> .__diff__
@sed -i 's/^/\/\// ' .__head__
......
......@@ -119,6 +119,17 @@ object huancun extends XSModule with SbtModule {
)
}
object coupledL2 extends XSModule with SbtModule {
override def millSourcePath = os.pwd / "coupledL2"
override def moduleDeps = super.moduleDeps ++ Seq(
rocketchip,
huancun,
utility
)
}
object difftest extends XSModule with SbtModule {
override def millSourcePath = os.pwd / "difftest"
}
......@@ -141,6 +152,7 @@ trait CommonXiangShan extends XSModule with SbtModule { m =>
def rocketModule: PublishModule
def difftestModule: PublishModule
def huancunModule: PublishModule
def coupledL2Module: PublishModule
def fudianModule: PublishModule
def utilityModule: PublishModule
......@@ -154,6 +166,7 @@ trait CommonXiangShan extends XSModule with SbtModule { m =>
rocketModule,
difftestModule,
huancunModule,
coupledL2Module,
fudianModule,
utilityModule
)
......@@ -174,6 +187,7 @@ object XiangShan extends CommonXiangShan {
override def rocketModule = rocketchip
override def difftestModule = difftest
override def huancunModule = huancun
override def coupledL2Module = coupledL2
override def fudianModule = fudian
override def utilityModule = utility
}
Subproject commit 5b65bc6d5f3d7bbbc2ae5f5726dfe2d257170a39
Subproject commit ea83bb7f84115ecfa0568f6697086f186827ea06
Subproject commit 41a2f27f21744351374e27724ce10a4c8354f400
Subproject commit d5b306ce44261e8a703ce6333fa6f4060d7f522c
Subproject commit b7308d958dfa7073e47ca10ce3974d267592049c
#!/bin/bash
mem_script=$1
conf_file=$2
output_dir=$3
IFS=$'\n'
for line in `cat $conf_file`; do
file=`echo "$line" | grep -oP '(?<=name )[^ ]*(?= .*)'`
echo $line >${conf_file}.tmp
${mem_script} ${conf_file}.tmp -o ${output_dir}/${file}.v
done
rm ${conf_file}.tmp
# top-down 分析工具
最新的 top-down 分析工具已经与 env-scripts 集成。在使用 `xs_autorun.py` 完成 checkpoint 的运行后,使用 `--report-top-down` 参数即可!
本仓库集成了 top-down 分析所需要的工具。
## 运行仿真
......
......@@ -31,14 +31,14 @@ tmp=$(grep "stall_loads_bound," $filename)
load_bound_cycles=${tmp##* }
tmp=$(grep "stall_ls_bandwidth_bound," $filename)
ls_dq_bound_cycles=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_rob," $filename)
stall_cycle_rob=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_int_dq," $filename)
stall_cycle_int_dq=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_fp_dq," $filename)
stall_cycle_fp_dq=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_ls_dq," $filename)
stall_cycle_ls_dq=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_rob_blame," $filename)
stall_cycle_rob_blame=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_int_blame," $filename)
stall_cycle_int_blame=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_fp_blame," $filename)
stall_cycle_fp_blame=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_ls_blame," $filename)
stall_cycle_ls_blame=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.rename: stall_cycle_fp," $filename)
stall_cycle_fp=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.rename: stall_cycle_int," $filename)
......
......@@ -107,7 +107,7 @@ def process_one(path, head):
csv_file['ifu2id_allNO_slots'] = use('ifu2id_allNO_cycle') * 6
csv_file['ifu2id_hvButNotFull_slots'] = use('fetch_bubbles') - use('ifu2id_allNO_slots')
stall_cycles_core = use('stall_cycle_fp') + use('stall_cycle_int') + use('stall_cycle_rob') + use('stall_cycle_int_dq') + use('stall_cycle_fp_dq') + use('ls_dq_bound_cycles')
stall_cycles_core = use('stall_cycle_fp') + use('stall_cycle_int') + use('stall_cycle_rob_blame') + use('stall_cycle_int_blame') + use('stall_cycle_fp_blame') + use('ls_dq_bound_cycles')
top = TopDown("Top", 1.0)
......@@ -146,9 +146,9 @@ def process_one(path, head):
loads_bound = memory_bound.add_down("Loads Bound", use('load_bound_cycles') / use('total_cycles'))
# top->backend_bound->core_bound
integer_dq = core_bound.add_down("Integer DQ", core_bound * use('stall_cycle_int_dq') / stall_cycles_core)
floatpoint_dq = core_bound.add_down("Floatpoint DQ", core_bound * use('stall_cycle_fp_dq') / stall_cycles_core)
rob = core_bound.add_down("ROB", core_bound * use('stall_cycle_rob') / stall_cycles_core)
integer_dq = core_bound.add_down("Integer DQ", core_bound * use('stall_cycle_int_blame') / stall_cycles_core)
floatpoint_dq = core_bound.add_down("Floatpoint DQ", core_bound * use('stall_cycle_fp_blame') / stall_cycles_core)
rob = core_bound.add_down("ROB", core_bound * use('stall_cycle_rob_blame') / stall_cycles_core)
integer_prf = core_bound.add_down("Integer PRF", core_bound * use('stall_cycle_int') / stall_cycles_core)
floatpoint_prf = core_bound.add_down("Floatpoint PRF", core_bound * use('stall_cycle_fp') / stall_cycles_core)
lsu_ports = core_bound.add_down("LSU Ports", core_bound * use('ls_dq_bound_cycles') / stall_cycles_core)
......
......@@ -87,20 +87,32 @@ func paramstr(chn, param) {
}
{
$1 = $NF; # timestamp
echo = $2;
user = $3;
data_1 = $4;
data_2 = $5;
data_3 = $6;
data_4 = $7;
sink = $9;
source = $10;
$1 = $14; # timestamp
$2 = $NF; # name
$3 = chnstr($13) # channel
$NF = ""; # remove log id
$5 = paramstr($3, $5) # param
$4 = opstr($3, $4) # opcode
$3 = chnstr($3) # channel
for(i=8; i<=12; i++){
if(i == 8){ # col 8 is address
$i = sprintf("%lx", $i);
} else { # cols 9-12 are data
$i = sprintf("%016lx", $i);
}
}
$13 = sprintf("user: %lx", $13);
$14 = sprintf("echo: %lx", $14);
$6 = sink;
$7 = source;
$5 = paramstr($13, $11) # param
$4 = opstr($13, $12) # opcode
$8 = sprintf("%lx", $8) # address
$9 = sprintf("%016lx", data_1)
$10 = sprintf("%016lx", data_2)
$11 = sprintf("%016lx", data_3)
$12 = sprintf("%016lx", data_4)
$13 = sprintf("user: %lx", user);
$14 = sprintf("echo: %lx", echo);
}
1 # print every line
......
......@@ -88,6 +88,7 @@ class XSArgs(object):
self.diff = self.diff.replace("nemu-interpreter", "spike")
self.fork = not args.disable_fork
self.disable_diff = args.no_diff
self.disable_db = args.no_db
# wave dump path
if args.wave_dump is not None:
self.set_wave_home(args.wave_dump)
......@@ -246,7 +247,8 @@ class XiangShan(object):
numa_args = f"numactl -m {numa_info[0]} -C {numa_info[1]}-{numa_info[2]}"
fork_args = "--enable-fork" if self.args.fork else ""
diff_args = "--no-diff" if self.args.disable_diff else ""
return_code = self.__exec_cmd(f'{numa_args} $NOOP_HOME/build/emu -i {workload} {emu_args} {fork_args} {diff_args}')
chiseldb_args = "--dump-db" if not self.args.disable_db else ""
return_code = self.__exec_cmd(f'{numa_args} $NOOP_HOME/build/emu -i {workload} {emu_args} {fork_args} {diff_args} {chiseldb_args}')
return return_code
def run_simv(self, workload):
......@@ -414,6 +416,7 @@ class XiangShan(object):
self.__exec_cmd(f"cp $NOOP_HOME/build/*.vcd $WAVE_HOME")
self.__exec_cmd(f"cp $NOOP_HOME/build/emu $WAVE_HOME")
self.__exec_cmd(f"cp $NOOP_HOME/build/SimTop.v $WAVE_HOME")
self.__exec_cmd(f"cp $NOOP_HOME/build/*.db $WAVE_HOME")
return ret
return 0
......@@ -436,6 +439,7 @@ class XiangShan(object):
self.__exec_cmd(f"cp $NOOP_HOME/build/*.vcd $WAVE_HOME")
self.__exec_cmd(f"cp $NOOP_HOME/build/emu $WAVE_HOME")
self.__exec_cmd(f"cp $NOOP_HOME/build/SimTop.v $WAVE_HOME")
self.__exec_cmd(f"cp $NOOP_HOME/build/*.db $WAVE_HOME")
return ret
return 0
......@@ -488,6 +492,7 @@ if __name__ == "__main__":
parser.add_argument('--disable-fork', action='store_true', help='disable lightSSS')
parser.add_argument('--no-diff', action='store_true', help='disable difftest')
parser.add_argument('--ram-size', nargs='?', type=str, help='manually set simulation memory size (8GB by default)')
parser.add_argument('--no-db', action='store_true', help='disable chiseldb dump')
args = parser.parse_args()
......
......@@ -24,14 +24,13 @@ import freechips.rocketchip.devices.tilelink.{CLINT, CLINTParams, DevNullParams,
import freechips.rocketchip.diplomacy.{AddressSet, IdRange, InModuleBody, LazyModule, LazyModuleImp, MemoryDevice, RegionType, SimpleDevice, TransferSizes}
import freechips.rocketchip.interrupts.{IntSourceNode, IntSourcePortSimple}
import freechips.rocketchip.regmapper.{RegField, RegFieldAccessType, RegFieldDesc, RegFieldGroup}
import utility.{BinaryArbiter, TLEdgeBuffer}
import utility.{BinaryArbiter, TLClientsMerger, TLEdgeBuffer, TLLogger}
import xiangshan.{DebugOptionsKey, HasXSParameter, XSBundle, XSCore, XSCoreParameters, XSTileKey}
import freechips.rocketchip.amba.axi4._
import freechips.rocketchip.tilelink._
import top.BusPerfMonitor
import xiangshan.backend.fu.PMAConst
import huancun._
import huancun.debug.TLLogger
case object SoCParamsKey extends Field[SoCParameters]
......@@ -42,7 +41,7 @@ case class SoCParameters
extIntrs: Int = 64,
L3NBanks: Int = 4,
L3CacheParamsOpt: Option[HCCacheParameters] = Some(HCCacheParameters(
name = "l3",
name = "L3",
level = 3,
ways = 8,
sets = 2048 // 1MB per bank
......@@ -148,10 +147,13 @@ trait HaveAXI4MemPort {
))
val mem_xbar = TLXbar()
val l3_mem_pmu = BusPerfMonitor(name = "L3_Mem", enable = !debugOpts.FPGAPlatform, stat_latency = true, add_reqkey = true)
mem_xbar :=*
TLBuffer.chainNode(2) :=
TLCacheCork() :=
l3_mem_pmu :=
TLClientsMerger() :=
TLXbar() :=*
TLBuffer.chainNode(2) :=*
TLCacheCork() :=*
bankedNode
mem_xbar :=
......@@ -232,10 +234,9 @@ class SoCMisc()(implicit p: Parameters) extends BaseSoC
val l3_in = TLTempNode()
val l3_out = TLTempNode()
val l3_mem_pmu = BusPerfMonitor(enable = !debugOpts.FPGAPlatform)
l3_in :*= TLEdgeBuffer(_ => true, Some("L3_in_buffer")) :*= l3_banked_xbar
bankedNode :*= TLLogger("MEM_L3", !debugOpts.FPGAPlatform) :*= l3_mem_pmu :*= l3_out
bankedNode :*= TLLogger("MEM_L3", !debugOpts.FPGAPlatform && debugOpts.AlwaysBasicDB) :*= l3_out
if(soc.L3CacheParamsOpt.isEmpty){
l3_out :*= l3_in
......@@ -247,7 +248,7 @@ class SoCMisc()(implicit p: Parameters) extends BaseSoC
for ((core_out, i) <- core_to_l3_ports.zipWithIndex){
l3_banked_xbar :=*
TLLogger(s"L3_L2_$i", !debugOpts.FPGAPlatform) :=*
TLLogger(s"L3_L2_$i", !debugOpts.FPGAPlatform && debugOpts.AlwaysBasicDB) :=*
TLBuffer() :=
core_out
}
......
......@@ -47,10 +47,11 @@ object ArgParser {
val c = Class.forName(prefix + confString).getConstructor(Integer.TYPE)
c.newInstance(1.asInstanceOf[Object]).asInstanceOf[Parameters]
}
def parse(args: Array[String]): (Parameters, Array[String], FirrtlCompiler) = {
def parse(args: Array[String]): (Parameters, Array[String], FirrtlCompiler, Array[String]) = {
val default = new DefaultConfig(1)
var firrtlOpts = Array[String]()
var firrtlCompiler: FirrtlCompiler = SFC
var firtoolOpts = Array[String]()
@tailrec
def nextOption(config: Parameters, list: List[String]): Parameters = {
list match {
......@@ -71,6 +72,10 @@ object ArgParser {
nextOption(config.alter((site, here, up) => {
case DebugOptionsKey => up(DebugOptionsKey).copy(UseDRAMSim = true)
}), tail)
case "--with-constantin" :: tail =>
nextOption(config.alter((site, here, up) => {
case DebugOptionsKey => up(DebugOptionsKey).copy(EnableConstantin = true)
}), tail)
case "--fpga-platform" :: tail =>
nextOption(config.alter((site, here, up) => {
case DebugOptionsKey => up(DebugOptionsKey).copy(FPGAPlatform = true)
......@@ -87,13 +92,12 @@ object ArgParser {
nextOption(config.alter((site, here, up) => {
case DebugOptionsKey => up(DebugOptionsKey).copy(EnablePerfDebug = false)
}), tail)
case "--enable-topdown" :: tail =>
nextOption(config.alter((site, here, up) => {
case DebugOptionsKey => up(DebugOptionsKey).copy(EnableTopDown = true)
}), tail)
case "--mfc" :: tail =>
firrtlCompiler = MFC
nextOption(config, tail)
case "--firtool-opt" :: option :: tail =>
firtoolOpts :+= option
nextOption(config, tail)
case option :: tail =>
// unknown option, maybe a firrtl option, skip
firrtlOpts :+= option
......@@ -101,6 +105,6 @@ object ArgParser {
}
}
var config = nextOption(default, args.toList)
(config, firrtlOpts, firrtlCompiler)
(config, firrtlOpts, firrtlCompiler, firtoolOpts)
}
}
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package top
import chipsalliance.rocketchip.config.Parameters
......@@ -6,13 +22,28 @@ import freechips.rocketchip.tilelink._
import chisel3._
import chisel3.util._
import utils.{XSPerfAccumulate, XSPerfPrint}
import freechips.rocketchip.tilelink.TLMessages._
import freechips.rocketchip.tilelink.TLPermissions._
import utility.{ReqSourceField, ReqSourceKey, GTimer}
import xiangshan.MemReqSource
class BusPerfMonitor()(implicit p: Parameters) extends LazyModule {
val node = TLAdapterNode()
lazy val module = new BusPerfMonitorImp(this)
class BusPerfMonitor(name: String, stat_latency: Boolean, add_reqkey: Boolean)(implicit p: Parameters) extends LazyModule {
val node = if (add_reqkey) TLAdapterNode(managerFn = { m =>
TLSlavePortParameters.v1(
m.managers.map { m =>
m.v2copy()
},
requestKeys = Seq(ReqSourceKey),
beatBytes = 32,
endSinkId = m.endSinkId
)
}) else {
TLAdapterNode()
}
lazy val module = new BusPerfMonitorImp(this, name, stat_latency)
}
class BusPerfMonitorImp(outer: BusPerfMonitor)
class BusPerfMonitorImp(outer: BusPerfMonitor, name: String, stat_latency: Boolean)
extends LazyModuleImp(outer)
{
......@@ -24,7 +55,7 @@ class BusPerfMonitorImp(outer: BusPerfMonitor)
def PERF_CHN[T <: TLChannel](clientName: String, chn: DecoupledIO[T]) = {
val channelName = chn.bits.channelName.replaceAll(" ", "_").replaceAll("'", "")
XSPerfAccumulate(s"${clientName}_${channelName}_fire", chn.fire())
XSPerfAccumulate(s"${clientName}_${channelName}_fire", chn.fire)
XSPerfAccumulate(s"${clientName}_${channelName}_stall", chn.valid && !chn.ready)
val ops = chn.bits match {
......@@ -40,28 +71,28 @@ class BusPerfMonitorImp(outer: BusPerfMonitor)
chn.bits match {
case a: TLBundleA =>
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_fire",
i.U === a.opcode && chn.fire()
i.U === a.opcode && chn.fire
)
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_stall",
i.U === a.opcode && chn.valid && !chn.ready
)
case b: TLBundleB =>
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_fire",
i.U === b.opcode && chn.fire()
i.U === b.opcode && chn.fire
)
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_stall",
i.U === b.opcode && chn.valid && !chn.ready
)
case c: TLBundleC =>
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_fire",
i.U === c.opcode && chn.fire()
i.U === c.opcode && chn.fire
)
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_stall",
i.U === c.opcode && chn.valid && !chn.ready
)
case d: TLBundleD =>
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_fire",
i.U === d.opcode && chn.fire()
i.U === d.opcode && chn.fire
)
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_stall",
i.U === d.opcode && chn.valid && !chn.ready
......@@ -70,22 +101,86 @@ class BusPerfMonitorImp(outer: BusPerfMonitor)
}
}
for(((in, edgeIn), i) <- outer.node.in.zipWithIndex) {
val clientName = s"${edgeIn.master.masters.head.name}_bank_$i"
for (((in, edgeIn), i) <- outer.node.in.zipWithIndex) {
val clientName = s"${name}_${edgeIn.master.masters.head.name}_bank_$i"
PERF_CHN(clientName, in.a)
PERF_CHN(clientName, in.d)
if(in.params.hasBCE){
if (in.params.hasBCE) {
PERF_CHN(clientName, in.b)
PERF_CHN(clientName, in.c)
PERF_CHN(clientName, in.e)
}
}
if (stat_latency) {
val nrEdge = outer.node.in.length.toInt
val edgeIn = outer.node.in.head._2
class RecordEntry()(implicit p: Parameters) extends Bundle {
val valid = Bool()
val timeStamp = UInt(64.W)
val reqType = UInt(8.W)
}
// For simplicity, latency statistic works between nodes with SINGLE edge
require(nrEdge == 1)
val timer = GTimer()
val nrSource = math.pow(2, edgeIn.bundle.sourceBits).toInt
val latencyRecord = RegInit(VecInit(Seq.fill(nrSource)(0.U.asTypeOf(new RecordEntry()))))
val latencySum = RegInit(0.U(128.W))
val nrRecord = RegInit(0.U(128.W))
outer.node.in.zip(outer.node.out).zipWithIndex.foreach {
case (((in, edgeIn), (out, edgeOut)), i) =>
val channelA = in.a
when(channelA.fire &&
channelA.bits.opcode =/= Hint &&
channelA.bits.opcode =/= PutFullData &&
channelA.bits.opcode =/= PutPartialData
) {
// Valid channel A fire, record it
assert(latencyRecord(channelA.bits.source).valid === false.B)
latencyRecord(channelA.bits.source).valid := true.B
latencyRecord(channelA.bits.source).timeStamp := timer
latencyRecord(channelA.bits.source).reqType := channelA.bits.user.lift(ReqSourceKey).getOrElse(MemReqSource.NoWhere.id.U)
}
val channelD = in.d
val (first, _, _, _) = edgeIn.count(channelD)
// Valid channel D fire, resolve it
val resolveRecord = channelD.fire && first &&
channelD.bits.opcode =/= ReleaseAck &&
channelD.bits.opcode =/= AccessAck
val latency = WireInit(0.U(64.W))
when(resolveRecord) {
assert(latencyRecord(channelD.bits.source).valid === true.B)
latencyRecord(channelD.bits.source).valid := false.B
latency := timer - latencyRecord(channelD.bits.source).timeStamp
latencySum := latencySum + timer
nrRecord := nrRecord + 1.U
// printf("timer: %x\n", latency)
}
XSPerfAccumulate(name + "_nrRecord_all", resolveRecord)
XSPerfAccumulate(name + "_latencySum_all", Mux(resolveRecord, latency, 0.U))
for (j <- 0 until MemReqSource.ReqSourceCount.id) {
val typeMatch = latencyRecord(channelD.bits.source).reqType === j.U
XSPerfAccumulate(name + s"_nrRecord_type${j}", resolveRecord && typeMatch)
XSPerfAccumulate(name + s"_latencySum_type${j}", Mux(resolveRecord && typeMatch, latency, 0.U))
}
}
}
}
object BusPerfMonitor {
def apply(enable: Boolean = false)(implicit p: Parameters) = {
def apply(
name: String,
enable: Boolean = false,
stat_latency: Boolean = false,
add_reqkey: Boolean = false)(implicit p: Parameters) =
{
if(enable){
val busPMU = LazyModule(new BusPerfMonitor())
val busPMU = LazyModule(new BusPerfMonitor(name, stat_latency, add_reqkey))
busPMU.node
} else {
TLTempNode()
......
......@@ -33,6 +33,7 @@ import xiangshan.cache.DCacheParameters
import xiangshan.cache.mmu.{L2TLBParameters, TLBParameters}
import device.{EnableJtag, XSDebugModuleParams}
import huancun._
import coupledL2._
class BaseConfig(n: Int) extends Config((site, here, up) => {
case XLen => 64
......@@ -62,10 +63,16 @@ class MinimalConfig(n: Int = 1) extends Config(
FetchWidth = 4,
IssQueSize = 8,
NRPhyRegs = 64,
LoadQueueSize = 16,
LoadQueueNWriteBanks = 4,
VirtualLoadQueueSize = 16,
LoadQueueRARSize = 16,
LoadQueueRAWSize = 12,
LoadQueueReplaySize = 8,
LoadUncacheBufferSize = 8,
LoadQueueNWriteBanks = 4, // NOTE: make sure that LoadQueue{RAR, RAW, Replay}Size is divided by LoadQueueNWriteBanks.
RollbackGroupSize = 8,
StoreQueueSize = 12,
StoreQueueNWriteBanks = 4,
StoreQueueNWriteBanks = 4, // NOTE: make sure that StoreQueueSize is divided by StoreQueueNWriteBanks
StoreQueueForwardWithMask = true,
RobSize = 32,
FtqSize = 8,
IBufSize = 16,
......@@ -99,7 +106,8 @@ class MinimalConfig(n: Int = 1) extends Config(
nReleaseEntries = 1,
nProbeEntries = 2,
nPrefetchEntries = 2,
hasPrefetch = false
nPrefBufferEntries = 32,
hasPrefetch = true
),
dcacheParametersOpt = Some(DCacheParameters(
nSets = 64, // 32KB DCache
......@@ -173,7 +181,14 @@ class MinimalConfig(n: Int = 1) extends Config(
l3nWays = 8,
spSize = 2,
),
L2CacheParamsOpt = None, // remove L2 Cache
L2CacheParamsOpt = Some(L2Param(
name = "L2",
ways = 8,
sets = 128,
echoField = Seq(huancun.DirtyField()),
prefetch = None
)),
L2NBanks = 2,
prefetcher = None // if L2 pf_recv_node does not exist, disable SMS prefetcher
)
)
......@@ -183,14 +198,12 @@ class MinimalConfig(n: Int = 1) extends Config(
L3CacheParamsOpt = Some(up(SoCParamsKey).L3CacheParamsOpt.get.copy(
sets = 1024,
inclusive = false,
clientCaches = tiles.map{ p =>
CacheParameters(
"dcache",
sets = 2 * p.dcacheParametersOpt.get.nSets,
ways = p.dcacheParametersOpt.get.nWays + 2,
blockGranularity = log2Ceil(2 * p.dcacheParametersOpt.get.nSets),
aliasBitsOpt = None
)
clientCaches = tiles.map{ core =>
val clientDirBytes = tiles.map{ t =>
t.L2NBanks * t.L2CacheParamsOpt.map(_.toCacheParams.capacity).getOrElse(0)
}.sum
val l2params = core.L2CacheParamsOpt.get.toCacheParams
l2params.copy(sets = 2 * clientDirBytes / core.L2NBanks / l2params.ways / 64)
},
simulation = !site(DebugOptionsKey).FPGAPlatform
)),
......@@ -234,35 +247,25 @@ class WithNKBL2
n: Int,
ways: Int = 8,
inclusive: Boolean = true,
banks: Int = 1,
alwaysReleaseData: Boolean = false
banks: Int = 1
) extends Config((site, here, up) => {
case XSTileKey =>
val upParams = up(XSTileKey)
val l2sets = n * 1024 / banks / ways / 64
upParams.map(p => p.copy(
L2CacheParamsOpt = Some(HCCacheParameters(
L2CacheParamsOpt = Some(L2Param(
name = "L2",
level = 2,
ways = ways,
sets = l2sets,
inclusive = inclusive,
alwaysReleaseData = alwaysReleaseData,
clientCaches = Seq(CacheParameters(
clientCaches = Seq(L1Param(
"dcache",
sets = 2 * p.dcacheParametersOpt.get.nSets / banks,
ways = p.dcacheParametersOpt.get.nWays + 2,
blockGranularity = log2Ceil(2 * p.dcacheParametersOpt.get.nSets / banks),
aliasBitsOpt = p.dcacheParametersOpt.get.aliasBitsOpt
)),
reqField = Seq(PreferCacheField()),
echoField = Seq(DirtyField()),
prefetch = Some(huancun.prefetch.PrefetchReceiverParams()),
enablePerf = true,
sramDepthDiv = 2,
tagECC = Some("secded"),
dataECC = Some("secded"),
simulation = !site(DebugOptionsKey).FPGAPlatform
reqField = Seq(utility.ReqSourceField()),
echoField = Seq(huancun.DirtyField()),
prefetch = Some(coupledL2.prefetch.PrefetchReceiverParams())
)),
L2NBanks = banks
))
......@@ -292,6 +295,7 @@ class WithNKBL3(n: Int, ways: Int = 8, inclusive: Boolean = true, banks: Int = 1
address = 0x39000000,
numCores = tiles.size
)),
reqField = Seq(utility.ReqSourceField()),
sramClkDivBy2 = true,
sramDepthDiv = 4,
tagECC = Some("secded"),
......@@ -315,21 +319,21 @@ class DefaultL3DebugConfig(n: Int = 1) extends Config(
class MinimalAliasDebugConfig(n: Int = 1) extends Config(
new WithNKBL3(512, inclusive = false) ++
new WithNKBL2(256, inclusive = false, alwaysReleaseData = true) ++
new WithNKBL2(256, inclusive = false) ++
new WithNKBL1D(128) ++
new MinimalConfig(n)
)
class MediumConfig(n: Int = 1) extends Config(
new WithNKBL3(4096, inclusive = false, banks = 4)
++ new WithNKBL2(512, inclusive = false, alwaysReleaseData = true)
++ new WithNKBL2(512, inclusive = false)
++ new WithNKBL1D(128)
++ new BaseConfig(n)
)
class DefaultConfig(n: Int = 1) extends Config(
new WithNKBL3(6 * 1024, inclusive = false, banks = 4, ways = 6)
++ new WithNKBL2(2 * 512, inclusive = false, banks = 4, alwaysReleaseData = true)
++ new WithNKBL2(2 * 512, inclusive = false, banks = 4)
++ new WithNKBL1D(128)
++ new BaseConfig(n)
)
......@@ -44,7 +44,7 @@ case object MFC extends FirrtlCompiler
object Generator {
def execute(args: Array[String], mod: => RawModule, fc: FirrtlCompiler) = {
def execute(args: Array[String], mod: => RawModule, fc: FirrtlCompiler, firtoolOpts: Array[String]) = {
fc match {
case MFC =>
val sfcXsTransforms = Seq(
......@@ -69,9 +69,8 @@ object Generator {
})
(new circt.stage.ChiselStage).execute(mfcArgs, Seq(
ChiselGeneratorAnnotation(mod _),
circt.stage.CIRCTTargetAnnotation(circt.stage.CIRCTTarget.Verilog),
circt.stage.CIRCTHandover(circt.stage.CIRCTHandover.CHIRRTL)
))
circt.stage.CIRCTTargetAnnotation(circt.stage.CIRCTTarget.Verilog)
) ++ firtoolOpts.map(opt => circt.stage.FirtoolOption(opt)))
case SFC =>
(new XiangShanStage).execute(args, Seq(
ChiselGeneratorAnnotation(mod _),
......
......@@ -69,7 +69,7 @@ class XSTop()(implicit p: Parameters) extends BaseXSSoc() with HasSoCParameter
val l3cacheOpt = soc.L3CacheParamsOpt.map(l3param =>
LazyModule(new HuanCun()(new Config((_, _, _) => {
case HCCacheParamsKey => l3param.copy(enableTopDown = debugOpts.EnableTopDown)
case HCCacheParamsKey => l3param.copy(hartIds = tiles.map(_.HartId))
})))
)
......@@ -101,6 +101,8 @@ class XSTop()(implicit p: Parameters) extends BaseXSSoc() with HasSoCParameter
case Some(l3) =>
misc.l3_out :*= l3.node :*= TLBuffer.chainNode(2) :*= misc.l3_banked_xbar
case None =>
val dummyMatch = WireDefault(false.B)
tiles.map(_.HartId).foreach(hartId => ExcitingUtils.addSource(dummyMatch, s"L3MissMatch_${hartId}", ExcitingUtils.Perf, true))
}
lazy val module = new LazyRawModuleImp(this) {
......@@ -202,9 +204,17 @@ class XSTop()(implicit p: Parameters) extends BaseXSSoc() with HasSoCParameter
object TopMain extends App with HasRocketChipStageUtils {
override def main(args: Array[String]): Unit = {
val (config, firrtlOpts, firrtlComplier) = ArgParser.parse(args)
val (config, firrtlOpts, firrtlComplier, firtoolOpts) = ArgParser.parse(args)
// tools: init to close dpi-c when in fpga
val envInFPGA = config(DebugOptionsKey).FPGAPlatform
val enableChiselDB = config(DebugOptionsKey).EnableChiselDB
val enableConstantin = config(DebugOptionsKey).EnableConstantin
Constantin.init(enableConstantin && !envInFPGA)
ChiselDB.init(enableChiselDB && !envInFPGA)
val soc = DisableMonitors(p => LazyModule(new XSTop()(p)))(config)
Generator.execute(firrtlOpts, soc.module, firrtlComplier)
Generator.execute(firrtlOpts, soc.module, firrtlComplier, firtoolOpts)
FileRegisters.write(fileDir = "./build", filePrefix = "XSTop.")
}
}
......@@ -47,7 +47,7 @@ object XSLog {
if (!debugOpts.FPGAPlatform && (enableDebug || enablePerf || debugLevel == XSLogLevel.ERROR)) {
ExcitingUtils.addSink(logEnable, "DISPLAY_LOG_ENABLE")
ExcitingUtils.addSink(logTimestamp, "logTimestamp")
val check_cond = (if (debugLevel == XSLogLevel.ERROR) true.B else logEnable) && cond
val check_cond = (if (debugLevel == XSLogLevel.ERROR) true.B else logEnable) && cond && RegNext(true.B, false.B)
when (check_cond) {
val commonInfo = p"[$debugLevel][time=$logTimestamp] $MagicStr: "
printf((if (prefix) commonInfo else p"") + pable)
......
......@@ -31,6 +31,7 @@ import xiangshan.frontend.FtqPtr
import xiangshan.frontend.CGHPtr
import xiangshan.frontend.FtqRead
import xiangshan.frontend.FtqToCtrlIO
import xiangshan.cache.HasDCacheParameters
import utils._
import utility._
......@@ -57,14 +58,15 @@ object ValidUndirectioned {
}
object RSFeedbackType {
val tlbMiss = 0.U(3.W)
val mshrFull = 1.U(3.W)
val dataInvalid = 2.U(3.W)
val bankConflict = 3.U(3.W)
val ldVioCheckRedo = 4.U(3.W)
val lrqFull = 0.U(3.W)
val tlbMiss = 1.U(3.W)
val mshrFull = 2.U(3.W)
val dataInvalid = 3.U(3.W)
val bankConflict = 4.U(3.W)
val ldVioCheckRedo = 5.U(3.W)
val feedbackInvalid = 7.U(3.W)
val allTypes = 8
def apply() = UInt(3.W)
}
......@@ -89,7 +91,9 @@ class CfiUpdateInfo(implicit p: Parameters) extends XSBundle with HasBPUParamete
val histPtr = new CGHPtr
val specCnt = Vec(numBr, UInt(10.W))
// need pipeline update
val br_hit = Bool()
val br_hit = Bool() // if in ftb entry
val jr_hit = Bool() // if in ftb entry
val sc_hit = Bool() // if used in ftb entry, invalid if !br_hit
val predTaken = Bool()
val target = UInt(VAddrBits.W)
val taken = Bool()
......@@ -299,6 +303,8 @@ class Redirect(implicit p: Parameters) extends XSBundle {
val stFtqOffset = UInt(log2Up(PredictWidth).W)
val debug_runahead_checkpoint_id = UInt(64.W)
val debugIsCtrl = Bool()
val debugIsMemVio = Bool()
// def isUnconditional() = RedirectLevel.isUnconditional(level)
def flushItself() = RedirectLevel.flushItself(level)
......@@ -411,6 +417,7 @@ class MemRSFeedbackIO(implicit p: Parameters) extends XSBundle {
class FrontendToCtrlIO(implicit p: Parameters) extends XSBundle {
// to backend end
val cfVec = Vec(DecodeWidth, DecoupledIO(new CtrlFlow))
val stallReason = new StallReasonIO(DecodeWidth)
val fromFtq = new FtqToCtrlIO
// from backend
val toFtq = Flipped(new CtrlToFtqIO)
......@@ -659,3 +666,13 @@ class MatchTriggerIO(implicit p: Parameters) extends XSBundle {
val chain = Output(Bool())
val tdata2 = Output(UInt(64.W))
}
// Per-slot decode/rename stall-reason reporting channel.
// `width` is the number of pipeline slots covered (DecodeWidth at the use site in FrontendToCtrlIO).
// NOTE: field order in a Chisel Bundle fixes the hardware bit layout — do not reorder.
class StallReasonIO(width: Int) extends Bundle {
// One stall-reason code per slot, encoded as an index into TopDownCounters.NumStallReasons.
val reason = Output(Vec(width, UInt(log2Ceil(TopDownCounters.NumStallReasons.id).W)))
// Flipped: a single reason driven back from the downstream (backend) side, valid-qualified.
val backReason = Flipped(Valid(UInt(log2Ceil(TopDownCounters.NumStallReasons.id).W)))
}
// Custom L2 -> L1 hint interface: lets the L2 cache give the core early notice
// that a miss response is coming, identified by its MSHR slot.
class L2ToL1Hint(implicit p: Parameters) extends XSBundle with HasDCacheParameters {
// TileLink source ID mapped onto a DCache MSHR id; width sized by cfg.nMissEntries.
val sourceId = UInt(log2Up(cfg.nMissEntries).W) // tilelink sourceID -> mshr id
}
package xiangshan
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util.log2Ceil
import xiangshan.backend.rob.{DebugLsInfo, DebugMdpInfo}
import xiangshan.cache.DCacheBundle
/** Mem */
// Debug/trace record for a single load that missed in the DCache.
class LoadMissEntry(implicit p: Parameters) extends DCacheBundle {
// Timestamp (cycle count) when the entry was captured.
val timeCnt = UInt(XLEN.W)
// ROB index of the load instruction, for correlating with commit traces.
val robIdx = UInt(log2Ceil(RobSize).W)
// Physical address of the access.
val paddr = UInt(PAddrBits.W)
// Virtual address of the access.
val vaddr = UInt(VAddrBits.W)
// Miss progression state.
// 1:first hit, 2:first miss, 3:second miss
val missState = UInt(3.W)
}
// Per-instruction debug/profiling record: identity, exception info, load/store
// addresses, and latency breakdowns through the pipeline.
class InstInfoEntry(implicit p: Parameters) extends XSBundle{
// Globally unique instruction id for trace correlation.
val globalID = UInt(XLEN.W)
// ROB index of the instruction.
val robIdx = UInt(log2Ceil(RobSize).W)
// Functional-unit type (FuType encoding).
val instType = FuType()
// Exception vector raised by this instruction, if any.
val exceptType = UInt(ExceptionVec.ExceptionVecSize.W)
// Virtual PC of the instruction.
val ivaddr = UInt(VAddrBits.W)
val dvaddr = UInt(VAddrBits.W) // the load/store access virtual address
val dpaddr = UInt(VAddrBits.W) // physical address, meaningful only once the TLB lookup is valid
val tlbLatency = UInt(XLEN.W)  // cycles spent in address translation (originally: L1-to-L2 TLB latency)
val accessLatency = UInt(XLEN.W)  // RS issue time --> write back time
// Execution latency of the functional unit.
val executeLatency = UInt(XLEN.W)
// Latency from dispatch to issue.
val issueLatency = UInt(XLEN.W)
// Load/store debug details (see DebugLsInfo).
val lsInfo = new DebugLsInfo
// Memory-dependence-prediction debug details (see DebugMdpInfo).
val mdpInfo = new DebugMdpInfo
// Absolute issue timestamp.
val issueTime = UInt(XLEN.W)
// Absolute writeback timestamp.
val writebackTime = UInt(XLEN.W)
}
\ No newline at end of file
......@@ -30,6 +30,7 @@ import freechips.rocketchip.diplomacy.AddressSet
import system.SoCParamsKey
import huancun._
import huancun.debug._
import coupledL2._
import xiangshan.mem.prefetch.{PrefetcherParams, SMSParams}
import scala.math.min
......@@ -66,6 +67,7 @@ case class XSCoreParameters
UbtbGHRLength: Int = 4,
// HistoryLength: Int = 512,
EnableGHistDiff: Boolean = true,
EnableCommitGHistDiff: Boolean = true,
UbtbSize: Int = 256,
FtbSize: Int = 2048,
RasSize: Int = 32,
......@@ -129,10 +131,16 @@ case class XSCoreParameters
EnableLoadFastWakeUp: Boolean = true, // NOTE: not supported now, make it false
IssQueSize: Int = 16,
NRPhyRegs: Int = 192,
LoadQueueSize: Int = 80,
LoadQueueNWriteBanks: Int = 8,
VirtualLoadQueueSize: Int = 80,
LoadQueueRARSize: Int = 80,
LoadQueueRAWSize: Int = 64, // NOTE: make sure that LoadQueueRAWSize is power of 2.
RollbackGroupSize: Int = 8,
LoadQueueReplaySize: Int = 80,
LoadUncacheBufferSize: Int = 20,
LoadQueueNWriteBanks: Int = 8, // NOTE: make sure that LoadQueueRARSize/LoadQueueRAWSize is divided by LoadQueueNWriteBanks
StoreQueueSize: Int = 64,
StoreQueueNWriteBanks: Int = 8,
StoreQueueNWriteBanks: Int = 8, // NOTE: make sure that StoreQueueSize is divided by StoreQueueNWriteBanks
StoreQueueForwardWithMask: Boolean = true,
VlsQueueSize: Int = 8,
RobSize: Int = 256,
dpParams: DispatchParameters = DispatchParameters(
......@@ -183,6 +191,8 @@ case class XSCoreParameters
superNWays = 4,
superReplacer = Some("plru")
),
itlbPortNum: Int = 2 + ICacheParameters().prefetchPipeNum + 1,
ipmpPortNum: Int = 2 + ICacheParameters().prefetchPipeNum + 1,
ldtlbParameters: TLBParameters = TLBParameters(
name = "ldtlb",
normalNSets = 64,
......@@ -237,7 +247,8 @@ case class XSCoreParameters
replacer = Some("setplru"),
nMissEntries = 2,
nProbeEntries = 2,
nPrefetchEntries = 2,
nPrefetchEntries = 12,
nPrefBufferEntries = 64,
hasPrefetch = true,
),
dcacheParametersOpt: Option[DCacheParameters] = Some(DCacheParameters(
......@@ -248,12 +259,11 @@ case class XSCoreParameters
nProbeEntries = 8,
nReleaseEntries = 18
)),
L2CacheParamsOpt: Option[HCCacheParameters] = Some(HCCacheParameters(
L2CacheParamsOpt: Option[L2Param] = Some(L2Param(
name = "l2",
level = 2,
ways = 8,
sets = 1024, // default 512KB L2
prefetch = Some(huancun.prefetch.PrefetchReceiverParams())
prefetch = Some(coupledL2.prefetch.PrefetchReceiverParams())
)),
L2NBanks: Int = 1,
usePTWRepeater: Boolean = false,
......@@ -287,7 +297,9 @@ case class DebugOptions
EnableDebug: Boolean = false,
EnablePerfDebug: Boolean = true,
UseDRAMSim: Boolean = false,
EnableTopDown: Boolean = false
EnableConstantin: Boolean = false,
EnableChiselDB: Boolean = false,
AlwaysBasicDB: Boolean = true,
)
trait HasXSParameter {
......@@ -329,6 +341,7 @@ trait HasXSParameter {
val EnbaleTlbDebug = coreParams.EnbaleTlbDebug
val HistoryLength = coreParams.HistoryLength
val EnableGHistDiff = coreParams.EnableGHistDiff
val EnableCommitGHistDiff = coreParams.EnableCommitGHistDiff
val UbtbGHRLength = coreParams.UbtbGHRLength
val UbtbSize = coreParams.UbtbSize
val EnableFauFTB = coreParams.EnableFauFTB
......@@ -390,10 +403,16 @@ trait HasXSParameter {
val PhyRegIdxWidth = log2Up(NRPhyRegs)
val RobSize = coreParams.RobSize
val IntRefCounterWidth = log2Ceil(RobSize)
val LoadQueueSize = coreParams.LoadQueueSize
val VirtualLoadQueueSize = coreParams.VirtualLoadQueueSize
val LoadQueueRARSize = coreParams.LoadQueueRARSize
val LoadQueueRAWSize = coreParams.LoadQueueRAWSize
val RollbackGroupSize = coreParams.RollbackGroupSize
val LoadQueueReplaySize = coreParams.LoadQueueReplaySize
val LoadUncacheBufferSize = coreParams.LoadUncacheBufferSize
val LoadQueueNWriteBanks = coreParams.LoadQueueNWriteBanks
val StoreQueueSize = coreParams.StoreQueueSize
val StoreQueueNWriteBanks = coreParams.StoreQueueNWriteBanks
val StoreQueueForwardWithMask = coreParams.StoreQueueForwardWithMask
val VlsQueueSize = coreParams.VlsQueueSize
val dpParams = coreParams.dpParams
val exuParameters = coreParams.exuParameters
......@@ -463,7 +482,7 @@ trait HasXSParameter {
val SSIDWidth = log2Up(LFSTSize)
val LFSTWidth = 4
val StoreSetEnable = true // LWT will be disabled if SS is enabled
val LFSTEnable = false
val loadExuConfigs = coreParams.loadExuConfigs
val storeExuConfigs = coreParams.storeExuConfigs
......
......@@ -248,6 +248,7 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
val l2_pf_enable = Output(Bool())
val perfEvents = Input(Vec(numPCntHc * coreParams.L2NBanks, new PerfEvent))
val beu_errors = Output(new XSL1BusErrors())
val l2Hint = Input(Valid(new L2ToL1Hint()))
})
println(s"FPGAPlatform:${env.FPGAPlatform} EnableDebug:${env.EnableDebug}")
......@@ -302,7 +303,7 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
ctrlBlock.io.memoryViolation <> memBlock.io.memoryViolation
exuBlocks.head.io.scheExtra.enqLsq.get <> memBlock.io.enqLsq
exuBlocks.foreach(b => {
b.io.scheExtra.lcommit := ctrlBlock.io.robio.lsq.lcommit
b.io.scheExtra.lcommit := memBlock.io.lqDeq
b.io.scheExtra.scommit := memBlock.io.sqDeq
b.io.scheExtra.lqCancelCnt := memBlock.io.lqCancelCnt
b.io.scheExtra.sqCancelCnt := memBlock.io.sqCancelCnt
......@@ -321,9 +322,13 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
ctrlBlock.io.dispatch <> exuBlocks.flatMap(_.io.in)
ctrlBlock.io.rsReady := exuBlocks.flatMap(_.io.scheExtra.rsReady)
ctrlBlock.io.enqLsq <> memBlock.io.enqLsq
ctrlBlock.io.lqDeq := memBlock.io.lqDeq
ctrlBlock.io.sqDeq := memBlock.io.sqDeq
ctrlBlock.io.lqCanAccept := memBlock.io.lsqio.lqCanAccept
ctrlBlock.io.sqCanAccept := memBlock.io.lsqio.sqCanAccept
ctrlBlock.io.lqCancelCnt := memBlock.io.lqCancelCnt
ctrlBlock.io.sqCancelCnt := memBlock.io.sqCancelCnt
ctrlBlock.io.robHeadLsIssue := exuBlocks.map(_.io.scheExtra.robHeadLsIssue).reduce(_ || _)
exuBlocks(0).io.scheExtra.fpRfReadIn.get <> exuBlocks(1).io.scheExtra.fpRfReadOut.get
exuBlocks(0).io.scheExtra.fpStateReadIn.get <> exuBlocks(1).io.scheExtra.fpStateReadOut.get
......@@ -352,8 +357,7 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
exu.scheExtra.stIssuePtr <> memBlock.io.stIssuePtr
exu.scheExtra.debug_fp_rat <> ctrlBlock.io.debug_fp_rat
exu.scheExtra.debug_int_rat <> ctrlBlock.io.debug_int_rat
exu.scheExtra.lqFull := memBlock.io.lqFull
exu.scheExtra.sqFull := memBlock.io.sqFull
exu.scheExtra.robDeqPtr := ctrlBlock.io.robDeqPtr
exu.scheExtra.memWaitUpdateReq.staIssue.zip(memBlock.io.stIn).foreach{case (sink, src) => {
sink.bits := src.bits
sink.valid := src.valid
......@@ -419,6 +423,9 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
memBlock.io.lsqio.rob <> ctrlBlock.io.robio.lsq
memBlock.io.lsqio.exceptionAddr.isStore := CommitType.lsInstIsStore(ctrlBlock.io.robio.exception.bits.uop.ctrl.commitType)
memBlock.io.debug_ls <> ctrlBlock.io.robio.debug_ls
memBlock.io.lsTopdownInfo <> ctrlBlock.io.robio.lsTopdownInfo
memBlock.io.l2Hint.valid := io.l2Hint.valid
memBlock.io.l2Hint.bits.sourceId := io.l2Hint.bits.sourceId
val itlbRepeater1 = PTWFilter(itlbParams.fenceDelay,frontend.io.ptw, fenceio.sfence, csrioIn.tlb, l2tlbParams.ifilterSize)
val itlbRepeater2 = PTWRepeaterNB(passReady = false, itlbParams.fenceDelay, itlbRepeater1.io.ptw, ptw.io.tlb(0), fenceio.sfence, csrioIn.tlb)
......@@ -428,6 +435,8 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
ptw.io.csr.tlb <> csrioIn.tlb
ptw.io.csr.distribute_csr <> csrioIn.customCtrl.distribute_csr
ExcitingUtils.addSource(dtlbRepeater1.io.rob_head_miss_in_tlb, s"miss_in_dtlb_${coreParams.HartId}", ExcitingUtils.Perf, true)
// if l2 prefetcher use stream prefetch, it should be placed in XSCore
io.l2_pf_enable := csrioIn.customCtrl.l2_pf_enable
......
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan
import chisel3._
......@@ -7,11 +23,10 @@ import freechips.rocketchip.diplomacy._
import freechips.rocketchip.interrupts._
import freechips.rocketchip.tile.{BusErrorUnit, BusErrorUnitParams, BusErrors}
import freechips.rocketchip.tilelink._
import huancun.debug.TLLogger
import huancun.{HCCacheParamsKey, HuanCun}
import coupledL2.{L2ParamKey, CoupledL2}
import system.HasSoCParameter
import top.BusPerfMonitor
import utility.{DelayN, ResetGen, TLClientsMerger, TLEdgeBuffer}
import utility.{DelayN, ResetGen, TLClientsMerger, TLEdgeBuffer, TLLogger}
class L1BusErrorUnitInfo(implicit val p: Parameters) extends Bundle with HasSoCParameter {
val ecc_error = Valid(UInt(soc.PAddrBits.W))
......@@ -44,19 +59,20 @@ class XSTileMisc()(implicit p: Parameters) extends LazyModule
val beu = LazyModule(new BusErrorUnit(
new XSL1BusErrors(), BusErrorUnitParams(0x38010000)
))
val busPMU = BusPerfMonitor(enable = !debugOpts.FPGAPlatform)
val l1d_logger = TLLogger(s"L2_L1D_${coreParams.HartId}", !debugOpts.FPGAPlatform)
val misc_l2_pmu = BusPerfMonitor(name = "Misc_L2", enable = !debugOpts.FPGAPlatform)
val l2_l3_pmu = BusPerfMonitor(name = "L2_L3", enable = !debugOpts.FPGAPlatform, stat_latency = true)
val l1d_logger = TLLogger(s"L2_L1D_${coreParams.HartId}", !debugOpts.FPGAPlatform && debugOpts.AlwaysBasicDB)
val l2_binder = coreParams.L2CacheParamsOpt.map(_ => BankBinder(coreParams.L2NBanks, 64))
val i_mmio_port = TLTempNode()
val d_mmio_port = TLTempNode()
busPMU := l1d_logger
l1_xbar :=* busPMU
misc_l2_pmu := l1d_logger
l1_xbar :=* misc_l2_pmu
l2_binder match {
case Some(binder) =>
memory_port := TLBuffer.chainNode(2) := TLClientsMerger() := TLXbar() :=* binder
memory_port := TLBuffer.chainNode(2) := l2_l3_pmu := TLClientsMerger() := TLXbar() :=* binder
case None =>
memory_port := l1_xbar
}
......@@ -79,8 +95,8 @@ class XSTile()(implicit p: Parameters) extends LazyModule
private val core = LazyModule(new XSCore())
private val misc = LazyModule(new XSTileMisc())
private val l2cache = coreParams.L2CacheParamsOpt.map(l2param =>
LazyModule(new HuanCun()(new Config((_, _, _) => {
case HCCacheParamsKey => l2param.copy(enableTopDown = env.EnableTopDown)
LazyModule(new CoupledL2()(new Config((_, _, _) => {
case L2ParamKey => l2param.copy(hartIds = Seq(p(XSCoreParamsKey).HartId))
})))
)
......@@ -92,10 +108,11 @@ class XSTile()(implicit p: Parameters) extends LazyModule
val debug_int_sink = core.debug_int_sink
val beu_int_source = misc.beu.intNode
val core_reset_sink = BundleBridgeSink(Some(() => Reset()))
val l1d_l2_pmu = BusPerfMonitor(name = "L1d_L2", enable = !debugOpts.FPGAPlatform, stat_latency = true)
val l1d_to_l2_bufferOpt = coreParams.dcacheParametersOpt.map { _ =>
val buffer = LazyModule(new TLBuffer)
misc.l1d_logger := buffer.node := core.memBlock.dcache.clientNode
misc.l1d_logger := buffer.node := l1d_l2_pmu := core.memBlock.dcache.clientNode
buffer
}
......@@ -108,29 +125,21 @@ class XSTile()(implicit p: Parameters) extends LazyModule
(buffers, node)
}
val (l1i_to_l2_buffers, l1i_to_l2_buf_node) = chainBuffer(3, "l1i_to_l2_buffer")
misc.busPMU :=
TLLogger(s"L2_L1I_${coreParams.HartId}", !debugOpts.FPGAPlatform) :=
l1i_to_l2_buf_node :=
core.frontend.icache.clientNode
val ptw_to_l2_buffers = if (!coreParams.softPTW) {
val (buffers, buf_node) = chainBuffer(5, "ptw_to_l2_buffer")
misc.busPMU :=
TLLogger(s"L2_PTW_${coreParams.HartId}", !debugOpts.FPGAPlatform) :=
buf_node :=
core.ptw_to_l2_buffer.node
buffers
} else Seq()
misc.misc_l2_pmu := TLLogger(s"L2_L1I_${coreParams.HartId}", !debugOpts.FPGAPlatform && debugOpts.AlwaysBasicDB) := core.frontend.icache.clientNode
if (!coreParams.softPTW) {
misc.misc_l2_pmu := TLLogger(s"L2_PTW_${coreParams.HartId}", !debugOpts.FPGAPlatform && debugOpts.AlwaysBasicDB) := core.ptw_to_l2_buffer.node
}
l2cache match {
case Some(l2) =>
misc.l2_binder.get :*= l2.node :*= TLBuffer() :*= TLBuffer() :*= misc.l1_xbar
misc.l2_binder.get :*= l2.node :*= misc.l1_xbar
l2.pf_recv_node.map(recv => {
println("Connecting L1 prefetcher to L2!")
recv := core.memBlock.pf_sender_opt.get
})
case None =>
val dummyMatch = WireDefault(false.B)
ExcitingUtils.addSource(dummyMatch, s"L2MissMatch_${p(XSCoreParamsKey).HartId}", ExcitingUtils.Perf, true)
}
misc.i_mmio_port := core.frontend.instrUncache.clientNode
......@@ -150,8 +159,9 @@ class XSTile()(implicit p: Parameters) extends LazyModule
core.module.io.hartId := io.hartId
core.module.io.reset_vector := DelayN(io.reset_vector, 5)
io.cpu_halt := core.module.io.cpu_halt
if(l2cache.isDefined){
core.module.io.perfEvents.zip(l2cache.get.module.io.perfEvents.flatten).foreach(x => x._1.value := x._2)
if (l2cache.isDefined) {
// TODO: add perfEvents of L2
// core.module.io.perfEvents.zip(l2cache.get.module.io.perfEvents.flatten).foreach(x => x._1.value := x._2)
}
else {
core.module.io.perfEvents <> DontCare
......@@ -159,11 +169,17 @@ class XSTile()(implicit p: Parameters) extends LazyModule
misc.module.beu_errors.icache <> core.module.io.beu_errors.icache
misc.module.beu_errors.dcache <> core.module.io.beu_errors.dcache
if(l2cache.isDefined){
misc.module.beu_errors.l2.ecc_error.valid := l2cache.get.module.io.ecc_error.valid
misc.module.beu_errors.l2.ecc_error.bits := l2cache.get.module.io.ecc_error.bits
if (l2cache.isDefined) {
// TODO: add ECC interface of L2
// misc.module.beu_errors.l2.ecc_error.valid := l2cache.get.module.io.ecc_error.valid
// misc.module.beu_errors.l2.ecc_error.bits := l2cache.get.module.io.ecc_error.bits
misc.module.beu_errors.l2 <> 0.U.asTypeOf(misc.module.beu_errors.l2)
core.module.io.l2Hint.bits.sourceId := l2cache.get.module.io.l2_hint.bits
core.module.io.l2Hint.valid := l2cache.get.module.io.l2_hint.valid
} else {
misc.module.beu_errors.l2 <> 0.U.asTypeOf(misc.module.beu_errors.l2)
core.module.io.l2Hint.bits.sourceId := DontCare
core.module.io.l2Hint.valid := false.B
}
// Modules are reset one by one
......@@ -173,8 +189,6 @@ class XSTile()(implicit p: Parameters) extends LazyModule
// reset ----> OR_SYNC --> {Misc, L2 Cache, Cores}
val resetChain = Seq(
Seq(misc.module, core.module) ++
l1i_to_l2_buffers.map(_.module.asInstanceOf[MultiIOModule]) ++
ptw_to_l2_buffers.map(_.module.asInstanceOf[MultiIOModule]) ++
l1d_to_l2_bufferOpt.map(_.module) ++
l2cache.map(_.module)
)
......
......@@ -27,7 +27,7 @@ import xiangshan.backend.decode.{DecodeStage, FusionDecoder, ImmUnion}
import xiangshan.backend.dispatch.{Dispatch, Dispatch2Rs, DispatchQueue}
import xiangshan.backend.fu.PFEvent
import xiangshan.backend.rename.{Rename, RenameTableWrapper}
import xiangshan.backend.rob.{DebugLSIO, Rob, RobCSRIO, RobLsqIO}
import xiangshan.backend.rob.{DebugLSIO, LsTopdownInfo, Rob, RobCSRIO, RobLsqIO, RobPtr}
import xiangshan.frontend.{FtqPtr, FtqRead, Ftq_RF_Components}
import xiangshan.mem.mdp.{LFST, SSIT, WaitTable}
import xiangshan.ExceptionNO._
......@@ -87,6 +87,8 @@ class RedirectGenerator(implicit p: Parameters) extends XSModule
val redirect = Wire(Valid(new Redirect))
redirect.valid := exuOut.valid && exuOut.bits.redirect.cfiUpdate.isMisPred
redirect.bits := exuOut.bits.redirect
redirect.bits.debugIsCtrl := true.B
redirect.bits.debugIsMemVio := false.B
redirect
}
......@@ -211,9 +213,12 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val dispatch = Vec(3*dpParams.IntDqDeqWidth, DecoupledIO(new MicroOp))
val rsReady = Vec(outer.dispatch2.map(_.module.io.out.length).sum, Input(Bool()))
val enqLsq = Flipped(new LsqEnqIO)
val lqCancelCnt = Input(UInt(log2Up(LoadQueueSize + 1).W))
val lqCancelCnt = Input(UInt(log2Up(VirtualLoadQueueSize + 1).W))
val sqCancelCnt = Input(UInt(log2Up(StoreQueueSize + 1).W))
val lqDeq = Input(UInt(log2Up(CommitWidth + 1).W))
val sqDeq = Input(UInt(log2Ceil(EnsbufferWidth + 1).W))
val sqCanAccept = Input(Bool())
val lqCanAccept = Input(Bool())
val ld_pc_read = Vec(exuParameters.LduCnt, Flipped(new FtqRead(UInt(VAddrBits.W))))
// from int block
val exuRedirect = Vec(exuParameters.AluCnt + exuParameters.JmpCnt, Flipped(ValidIO(new ExuOutput)))
......@@ -229,6 +234,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val lsq = new RobLsqIO
// debug
val debug_ls = Flipped(new DebugLSIO)
val lsTopdownInfo = Vec(exuParameters.LduCnt, Input(new LsTopdownInfo))
}
val csrCtrl = Input(new CustomCSRCtrlIO)
val perfInfo = Output(new Bundle{
......@@ -242,8 +248,11 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val writeback = MixedVec(writebackLengths.map(num => Vec(num, Flipped(ValidIO(new ExuOutput)))))
// redirect out
val redirect = ValidIO(new Redirect)
// debug
val debug_int_rat = Vec(32, Output(UInt(PhyRegIdxWidth.W)))
val debug_fp_rat = Vec(32, Output(UInt(PhyRegIdxWidth.W)))
val robDeqPtr = Output(new RobPtr)
val robHeadLsIssue = Input(Bool())
})
override def writebackSource: Option[Seq[Seq[Valid[ExuOutput]]]] = {
......@@ -288,6 +297,8 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val flushRedirect = Wire(Valid(new Redirect))
flushRedirect.valid := RegNext(rob.io.flushOut.valid)
flushRedirect.bits := RegEnable(rob.io.flushOut.bits, rob.io.flushOut.valid)
flushRedirect.bits.debugIsCtrl := false.B
flushRedirect.bits.debugIsMemVio := false.B
val flushRedirectReg = Wire(Valid(new Redirect))
flushRedirectReg.valid := RegNext(flushRedirect.valid, init = false.B)
......@@ -310,7 +321,10 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
!io.memoryViolation.bits.robIdx.needFlush(Seq(stage2Redirect, redirectForExu)),
init = false.B
)
loadReplay.bits := RegEnable(io.memoryViolation.bits, io.memoryViolation.valid)
val memVioBits = WireDefault(io.memoryViolation.bits)
memVioBits.debugIsCtrl := false.B
memVioBits.debugIsMemVio := true.B
loadReplay.bits := RegEnable(memVioBits, io.memoryViolation.valid)
pcMem.io.raddr(2) := redirectGen.io.redirectPcRead.ptr.value
redirectGen.io.redirectPcRead.data := pcMem.io.rdata(2).getPc(RegNext(redirectGen.io.redirectPcRead.offset))
pcMem.io.raddr(3) := redirectGen.io.memPredPcRead.ptr.value
......@@ -360,40 +374,8 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
pendingRedirect := false.B
}
if (env.EnableTopDown) {
val stage2Redirect_valid_when_pending = pendingRedirect && stage2Redirect.valid
val stage2_redirect_cycles = RegInit(false.B) // frontend_bound->fetch_lantency->stage2_redirect
val MissPredPending = RegInit(false.B); val branch_resteers_cycles = RegInit(false.B) // frontend_bound->fetch_lantency->stage2_redirect->branch_resteers
val RobFlushPending = RegInit(false.B); val robFlush_bubble_cycles = RegInit(false.B) // frontend_bound->fetch_lantency->stage2_redirect->robflush_bubble
val LdReplayPending = RegInit(false.B); val ldReplay_bubble_cycles = RegInit(false.B) // frontend_bound->fetch_lantency->stage2_redirect->ldReplay_bubble
when(redirectGen.io.isMisspreRedirect) { MissPredPending := true.B }
when(flushRedirect.valid) { RobFlushPending := true.B }
when(redirectGen.io.loadReplay.valid) { LdReplayPending := true.B }
when (RegNext(io.frontend.toFtq.redirect.valid)) {
when(pendingRedirect) { stage2_redirect_cycles := true.B }
when(MissPredPending) { MissPredPending := false.B; branch_resteers_cycles := true.B }
when(RobFlushPending) { RobFlushPending := false.B; robFlush_bubble_cycles := true.B }
when(LdReplayPending) { LdReplayPending := false.B; ldReplay_bubble_cycles := true.B }
}
when(VecInit(decode.io.out.map(x => x.valid)).asUInt.orR){
when(stage2_redirect_cycles) { stage2_redirect_cycles := false.B }
when(branch_resteers_cycles) { branch_resteers_cycles := false.B }
when(robFlush_bubble_cycles) { robFlush_bubble_cycles := false.B }
when(ldReplay_bubble_cycles) { ldReplay_bubble_cycles := false.B }
}
XSPerfAccumulate("stage2_redirect_cycles", stage2_redirect_cycles)
XSPerfAccumulate("branch_resteers_cycles", branch_resteers_cycles)
XSPerfAccumulate("robFlush_bubble_cycles", robFlush_bubble_cycles)
XSPerfAccumulate("ldReplay_bubble_cycles", ldReplay_bubble_cycles)
XSPerfAccumulate("s2Redirect_pend_cycles", stage2Redirect_valid_when_pending)
}
decode.io.in <> io.frontend.cfVec
decode.io.stallReason.in <> io.frontend.stallReason
decode.io.csrCtrl := RegNext(io.csrCtrl)
decode.io.intRat <> rat.io.intReadPorts
decode.io.fpRat <> rat.io.fpReadPorts
......@@ -416,11 +398,15 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
waittable.io.csrCtrl := RegNext(io.csrCtrl)
// LFST lookup and update
val lfst = Module(new LFST)
lfst.io.redirect <> RegNext(io.redirect)
lfst.io.storeIssue <> RegNext(io.stIn)
lfst.io.csrCtrl <> RegNext(io.csrCtrl)
lfst.io.dispatch <> dispatch.io.lfst
dispatch.io.lfst := DontCare
if (LFSTEnable) {
val lfst = Module(new LFST)
lfst.io.redirect <> RegNext(io.redirect)
lfst.io.storeIssue <> RegNext(io.stIn)
lfst.io.csrCtrl <> RegNext(io.csrCtrl)
lfst.io.dispatch <> dispatch.io.lfst
}
rat.io.redirect := stage2Redirect.valid
rat.io.robCommits := rob.io.commits
......@@ -479,6 +465,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
rename.io.ssit <> ssit.io.rdata
rename.io.debug_int_rat <> rat.io.debug_int_rat
rename.io.debug_fp_rat <> rat.io.debug_fp_rat
rename.io.stallReason.in <> decode.io.stallReason.out
// pipeline between rename and dispatch
for (i <- 0 until RenameWidth) {
......@@ -492,6 +479,12 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
dispatch.io.toFpDq <> fpDq.io.enq
dispatch.io.toLsDq <> lsDq.io.enq
dispatch.io.allocPregs <> io.allocPregs
dispatch.io.robHead := rob.io.debugRobHead
dispatch.io.stallReason <> rename.io.stallReason.out
dispatch.io.lqCanAccept := io.lqCanAccept
dispatch.io.sqCanAccept := io.sqCanAccept
dispatch.io.robHeadNotReady := rob.io.headNotReady
dispatch.io.robFull := rob.io.robFull
dispatch.io.singleStep := RegNext(io.csrCtrl.singlestep)
intDq.io.redirect <> redirectForExu
......@@ -513,11 +506,12 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val lsqCtrl = Module(new LsqEnqCtrl)
lsqCtrl.io.redirect <> redirectForExu
lsqCtrl.io.enq <> dp2.enqLsq.get
lsqCtrl.io.lcommit := rob.io.lsq.lcommit
lsqCtrl.io.lcommit := io.lqDeq
lsqCtrl.io.scommit := io.sqDeq
lsqCtrl.io.lqCancelCnt := io.lqCancelCnt
lsqCtrl.io.sqCancelCnt := io.sqCancelCnt
io.enqLsq <> lsqCtrl.io.enqLsq
rob.io.debugEnqLsq := io.enqLsq
}
}
for ((dp2In, i) <- outer.dispatch2.flatMap(_.module.io.in).zipWithIndex) {
......@@ -570,6 +564,9 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
io.robio.lsq <> rob.io.lsq
rob.io.debug_ls := io.robio.debug_ls
rob.io.debugHeadLsIssue := io.robHeadLsIssue
rob.io.lsTopdownInfo := io.robio.lsTopdownInfo
io.robDeqPtr := rob.io.robDeqPtr
io.perfInfo.ctrlInfo.robFull := RegNext(rob.io.robFull)
io.perfInfo.ctrlInfo.intdqFull := RegNext(intDq.io.dqFull)
......
......@@ -21,16 +21,17 @@ import chisel3._
import chisel3.util._
import freechips.rocketchip.diplomacy.{BundleBridgeSource, LazyModule, LazyModuleImp}
import freechips.rocketchip.tile.HasFPUParameters
import huancun.PrefetchRecv
import coupledL2.PrefetchRecv
import utils._
import utility._
import xiangshan._
import xiangshan.backend.exu.StdExeUnit
import xiangshan.backend.fu._
import xiangshan.backend.rob.{DebugLSIO, RobLsqIO}
import xiangshan.backend.rob.{DebugLSIO, LsTopdownInfo, RobLsqIO}
import xiangshan.cache._
import xiangshan.cache.mmu.{VectorTlbPtwIO, TLBNonBlock, TlbReplace}
import xiangshan.mem._
import xiangshan.mem.mdp._
import xiangshan.mem.prefetch.{BasePrefecher, SMSParams, SMSPrefetcher}
class Std(implicit p: Parameters) extends FunctionUnit {
......@@ -73,7 +74,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val issue = Vec(exuParameters.LsExuCnt + exuParameters.StuCnt, Flipped(DecoupledIO(new ExuInput)))
val loadFastMatch = Vec(exuParameters.LduCnt, Input(UInt(exuParameters.LduCnt.W)))
val loadFastImm = Vec(exuParameters.LduCnt, Input(UInt(12.W)))
val rsfeedback = Vec(exuParameters.StuCnt, new MemRSFeedbackIO)
val rsfeedback = Vec(exuParameters.LsExuCnt, new MemRSFeedbackIO)
val loadPc = Vec(exuParameters.LduCnt, Input(UInt(VAddrBits.W))) // for hw prefetch
val stIssuePtr = Output(new SqPtr())
val int2vlsu = Flipped(new Int2VLSUIO)
......@@ -99,6 +100,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val lsqio = new Bundle {
val exceptionAddr = new ExceptionAddrIO // to csr
val rob = Flipped(new RobLsqIO) // rob to lsq
val lqCanAccept = Output(Bool())
val sqCanAccept = Output(Bool())
}
val csrCtrl = Flipped(new CustomCSRCtrlIO)
val csrUpdate = new DistributedCSRUpdateReq
......@@ -108,13 +111,14 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val lqFull = Output(Bool())
val dcacheMSHRFull = Output(Bool())
}
val sqFull = Output(Bool())
val lqFull = Output(Bool())
val perfEventsPTW = Input(Vec(19, new PerfEvent))
val lqCancelCnt = Output(UInt(log2Up(LoadQueueSize + 1).W))
val lqCancelCnt = Output(UInt(log2Up(VirtualLoadQueueSize + 1).W))
val sqCancelCnt = Output(UInt(log2Up(StoreQueueSize + 1).W))
val sqDeq = Output(UInt(log2Ceil(EnsbufferWidth + 1).W))
val lqDeq = Output(UInt(log2Up(CommitWidth + 1).W))
val debug_ls = new DebugLSIO
val lsTopdownInfo = Vec(exuParameters.LduCnt, Output(new LsTopdownInfo))
val l2Hint = Input(Valid(new L2ToL1Hint()))
})
override def writebackSource1: Option[Seq[Seq[DecoupledIO[ExuOutput]]]] = Some(Seq(io.writeback))
......@@ -169,7 +173,6 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
loadUnits.zipWithIndex.map(x => x._1.suggestName("LoadUnit_"+x._2))
storeUnits.zipWithIndex.map(x => x._1.suggestName("StoreUnit_"+x._2))
val atomicsUnit = Module(new AtomicsUnit)
// Atom inst comes from sta / std, then its result
......@@ -178,17 +181,17 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
// However, atom exception will be writebacked to rob
// using store writeback port
val loadWritebackOverride = Mux(atomicsUnit.io.out.valid, atomicsUnit.io.out.bits, loadUnits.head.io.ldout.bits)
val ldOut0 = Wire(Decoupled(new ExuOutput))
ldOut0.valid := atomicsUnit.io.out.valid || loadUnits.head.io.ldout.valid
ldOut0.bits := loadWritebackOverride
atomicsUnit.io.out.ready := ldOut0.ready
loadUnits.head.io.ldout.ready := ldOut0.ready
val loadWritebackOverride = Mux(atomicsUnit.io.out.valid, atomicsUnit.io.out.bits, loadUnits.head.io.loadOut.bits)
val loadOut0 = Wire(Decoupled(new ExuOutput))
loadOut0.valid := atomicsUnit.io.out.valid || loadUnits.head.io.loadOut.valid
loadOut0.bits := loadWritebackOverride
atomicsUnit.io.out.ready := loadOut0.ready
loadUnits.head.io.loadOut.ready := loadOut0.ready
when(atomicsUnit.io.out.valid){
ldOut0.bits.uop.cf.exceptionVec := 0.U(16.W).asBools // exception will be writebacked via store wb port
loadOut0.bits.uop.cf.exceptionVec := 0.U(16.W).asBools // exception will be writebacked via store wb port
}
val ldExeWbReqs = ldOut0 +: loadUnits.tail.map(_.io.ldout)
val ldExeWbReqs = loadOut0 +: loadUnits.tail.map(_.io.loadOut)
io.writeback <> ldExeWbReqs ++ VecInit(storeUnits.map(_.io.stout)) ++ VecInit(stdExeUnits.map(_.io.out))
io.otherFastWakeup := DontCare
io.otherFastWakeup.take(2).zip(loadUnits.map(_.io.fastUop)).foreach{case(a,b)=> a := b}
......@@ -203,7 +206,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
loadUnits(0).io.prefetch_req.bits.confidence := 0.U
l1_pf_req.ready := (l1_pf_req.bits.confidence > 0.U) ||
loadUnits.map(!_.io.ldin.valid).reduce(_ || _)
loadUnits.map(!_.io.loadIn.valid).reduce(_ || _)
// l1 pf fuzzer interface
val DebugEnableL1PFFuzzer = false
......@@ -223,7 +226,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
}
// TODO: fast load wakeup
val lsq = Module(new LsqWrappper)
val lsq = Module(new LsqWrapper)
val vlsq = Module(new DummyVectorLsq)
val sbuffer = Module(new Sbuffer)
// if you wants to stress test dcache store, use FakeSbuffer
......@@ -303,6 +306,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
io.debug_ls.debugLsInfo(i + exuParameters.LduCnt) := storeUnits(i).io.debug_ls
}
io.lsTopdownInfo := loadUnits.map(_.io.lsTopdownInfo)
// pmp
val pmp = Module(new PMP())
pmp.io.distribute_csr <> csrCtrl.distribute_csr
......@@ -335,12 +340,67 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
PrintTriggerInfo(tEnable(j), tdata(j))
// LoadUnit
// One re-balance candidate: a fast-replay request tagged with the load
// pipeline it came from and whether it may be moved to another pipeline.
class BalanceEntry extends XSBundle {
// set when this replay was caused by a dcache bank conflict; only such
// entries are eligible to migrate to another load pipeline
val balance = Bool()
// the fast-replay request payload being carried
val req = new LqWriteBundle
// index of the load pipeline that produced this request
val port = UInt(log2Up(LoadPipelineWidth).W)
}
// Re-order fast-replay candidates: the highest-priority valid entry whose
// `balance` flag is set is swapped into slot 0, and the original slot-0
// entry takes the picked entry's slot; every other entry keeps its position.
// When no entry requests balancing, the input order is passed through.
def balanceReOrder(sel: Seq[ValidIO[BalanceEntry]]): Seq[ValidIO[BalanceEntry]] = {
require(sel.length > 0)
// highest-priority entry that is both valid and asking to be balanced
val balancePick = ParallelPriorityMux(sel.map(x => (x.valid && x.bits.balance) -> x))
val reorderSel = Wire(Vec(sel.length, ValidIO(new BalanceEntry)))
(0 until sel.length).map(i =>
if (i == 0) {
// slot 0 receives the picked entry when balancing is requested
when (balancePick.valid && balancePick.bits.balance) {
reorderSel(i) := balancePick
} .otherwise {
reorderSel(i) := sel(i)
}
} else {
// the slot the picked entry came from (bits.port) receives the
// original slot-0 entry, completing the swap
when (balancePick.valid && balancePick.bits.balance && i.U === balancePick.bits.port) {
reorderSel(i) := sel(0)
} .otherwise {
reorderSel(i) := sel(i)
}
}
)
reorderSel
}
// Wrap each load unit's fast-replay output as a BalanceEntry so the requests
// can be re-ordered across load pipelines by balanceReOrder.
val fastReplaySel = loadUnits.zipWithIndex.map { case (ldu, i) => {
val wrapper = Wire(Valid(new BalanceEntry))
wrapper.valid := ldu.io.fastReplayOut.valid
wrapper.bits.req := ldu.io.fastReplayOut.bits
// only bank-conflict replays are marked eligible for re-balancing
wrapper.bits.balance := ldu.io.fastReplayOut.bits.replayInfo.cause(LoadReplayCauses.bankConflict)
// remember the producing pipeline so the swap can be undone on the ready path
wrapper.bits.port := i.U
wrapper
}}
// re-balanced view of the fast-replay requests, indexed by target pipeline
val balanceFastReplaySel = balanceReOrder(fastReplaySel)
for (i <- 0 until exuParameters.LduCnt) {
loadUnits(i).io.redirect <> redirect
loadUnits(i).io.rsIdx := io.rsfeedback(i).rsIdx // DontCare
loadUnits(i).io.isFirstIssue := true.B
// get input from dispatch
loadUnits(i).io.ldin <> io.issue(i)
loadUnits(i).io.loadIn <> io.issue(i)
loadUnits(i).io.feedbackSlow <> io.rsfeedback(i).feedbackSlow
loadUnits(i).io.feedbackFast <> io.rsfeedback(i).feedbackFast
loadUnits(i).io.rsIdx := io.rsfeedback(i).rsIdx
// fast replay
loadUnits(i).io.fastReplayIn.valid := balanceFastReplaySel(i).valid
loadUnits(i).io.fastReplayIn.bits := balanceFastReplaySel(i).bits.req
loadUnits(i).io.fastReplayOut.ready := false.B
for (j <- 0 until exuParameters.LduCnt) {
when (balanceFastReplaySel(j).valid && balanceFastReplaySel(j).bits.port === i.U) {
loadUnits(i).io.fastReplayOut.ready := loadUnits(j).io.fastReplayIn.ready
}
}
// get input from dispatch
loadUnits(i).io.loadIn <> io.issue(i)
// dcache access
loadUnits(i).io.dcache <> dcache.io.lsu.load(i)
// forward
......@@ -349,7 +409,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
loadUnits(i).io.tlDchannel := dcache.io.lsu.forward_D(i)
loadUnits(i).io.forward_mshr <> dcache.io.lsu.forward_mshr(i)
// ld-ld violation check
loadUnits(i).io.lsq.loadViolationQuery <> lsq.io.loadViolationQuery(i)
loadUnits(i).io.lsq.loadLoadViolationQuery <> lsq.io.ldu.loadLoadViolationQuery(i)
loadUnits(i).io.lsq.storeLoadViolationQuery <> lsq.io.ldu.storeLoadViolationQuery(i)
loadUnits(i).io.csrCtrl <> csrCtrl
// dcache refill req
loadUnits(i).io.refill <> delayedDcacheRefill
......@@ -357,10 +418,11 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
loadUnits(i).io.tlb <> dtlb_reqs.take(exuParameters.LduCnt)(i)
// pmp
loadUnits(i).io.pmp <> pmp_check(i).resp
// st-ld violation query
// st-ld violation query
for (s <- 0 until StorePipelineWidth) {
loadUnits(i).io.reExecuteQuery(s) := storeUnits(s).io.reExecuteQuery
}
loadUnits(i).io.lqReplayFull <> lsq.io.lqReplayFull
// prefetch
prefetcherOpt.foreach(pf => {
pf.io.ld_in(i).valid := Mux(pf_train_on_hit,
......@@ -383,32 +445,21 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val fastMatch = ParallelPriorityMux(fastValidVec, fastMatchVec)
loadUnits(i).io.loadFastMatch := fastMatch
loadUnits(i).io.loadFastImm := io.loadFastImm(i)
loadUnits(i).io.replay <> lsq.io.replay(i)
// Lsq to load unit's rs
// passdown to lsq (load s1)
lsq.io.loadPaddrIn(i) <> loadUnits(i).io.lsq.loadPaddrIn
lsq.io.loadVaddrIn(i) <> loadUnits(i).io.lsq.loadVaddrIn
lsq.io.replayFast(i) := loadUnits(i).io.lsq.replayFast
lsq.io.replaySlow(i) := loadUnits(i).io.lsq.replaySlow
loadUnits(i).io.lsqOut <> lsq.io.loadOut(i)
loadUnits(i).io.l2Hint <> io.l2Hint
// passdown to lsq (load s2)
lsq.io.loadIn(i) <> loadUnits(i).io.lsq.loadIn
lsq.io.ldout(i) <> loadUnits(i).io.lsq.ldout
lsq.io.ldu.loadIn(i) <> loadUnits(i).io.lsq.loadIn
lsq.io.loadOut(i) <> loadUnits(i).io.lsq.loadOut
lsq.io.ldRawDataOut(i) <> loadUnits(i).io.lsq.ldRawData
lsq.io.s2_load_data_forwarded(i) <> loadUnits(i).io.lsq.s2_load_data_forwarded
lsq.io.trigger(i) <> loadUnits(i).io.lsq.trigger
// passdown to lsq (load s3)
lsq.io.s2_dcache_require_replay(i) <> loadUnits(i).io.lsq.s2_dcache_require_replay
lsq.io.s3_replay_from_fetch(i) <> loadUnits(i).io.lsq.s3_replay_from_fetch
lsq.io.s3_delayed_load_error(i) <> loadUnits(i).io.s3_delayed_load_error
lsq.io.l2Hint.valid := io.l2Hint.valid
lsq.io.l2Hint.bits.sourceId := io.l2Hint.bits.sourceId
// alter writeback exception info
io.s3_delayed_load_error(i) := loadUnits(i).io.lsq.s3_delayed_load_error
io.s3_delayed_load_error(i) := loadUnits(i).io.s3_delayedLoadError
// update mem dependency predictor
// io.memPredUpdate(i) := DontCare
......@@ -458,25 +509,25 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
stdExeUnits(i).io.out := DontCare
stu.io.redirect <> redirect
stu.io.feedbackSlow <> io.rsfeedback(i).feedbackSlow
stu.io.rsIdx <> io.rsfeedback(i).rsIdx
stu.io.feedbackSlow <> io.rsfeedback(exuParameters.LduCnt + i).feedbackSlow
stu.io.rsIdx <> io.rsfeedback(exuParameters.LduCnt + i).rsIdx
// NOTE: just for dtlb's perf cnt
stu.io.isFirstIssue <> io.rsfeedback(i).isFirstIssue
stu.io.isFirstIssue <> io.rsfeedback(exuParameters.LduCnt + i).isFirstIssue
stu.io.stin <> io.issue(exuParameters.LduCnt + i)
stu.io.lsq <> lsq.io.storeIn(i)
stu.io.lsq_replenish <> lsq.io.storeInRe(i)
stu.io.lsq <> lsq.io.sta.storeAddrIn(i)
stu.io.lsq_replenish <> lsq.io.sta.storeAddrInRe(i)
// dtlb
stu.io.tlb <> dtlb_reqs.drop(exuParameters.LduCnt)(i)
stu.io.pmp <> pmp_check(i+exuParameters.LduCnt).resp
// store unit does not need fast feedback
io.rsfeedback(i).feedbackFast := DontCare
io.rsfeedback(exuParameters.LduCnt + i).feedbackFast := DontCare
// Lsq to sta unit
lsq.io.storeMaskIn(i) <> stu.io.storeMaskOut
lsq.io.sta.storeMaskIn(i) <> stu.io.storeMaskOut
// Lsq to std unit's rs
lsq.io.storeDataIn(i) := stData(i)
lsq.io.std.storeDataIn(i) := stData(i)
// 1. sync issue info to store set LFST
......@@ -545,6 +596,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
lsq.io.enq <> io.enqLsq
lsq.io.brqRedirect <> redirect
io.memoryViolation <> lsq.io.rollback
io.lsqio.lqCanAccept := lsq.io.lqCanAccept
io.lsqio.sqCanAccept := lsq.io.sqCanAccept
// lsq.io.uncache <> uncache.io.lsq
AddPipelineReg(lsq.io.uncache.req, uncache.io.lsq.req, false.B)
AddPipelineReg(uncache.io.lsq.resp, lsq.io.uncache.resp, false.B)
......@@ -553,11 +606,11 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
lsq.io.release := dcache.io.lsu.release
lsq.io.lqCancelCnt <> io.lqCancelCnt
lsq.io.sqCancelCnt <> io.sqCancelCnt
lsq.io.lqDeq <> io.lqDeq
lsq.io.sqDeq <> io.sqDeq
// LSQ to store buffer
lsq.io.sbuffer <> sbuffer.io.in
lsq.io.sqempty <> sbuffer.io.sqempty
lsq.io.sqEmpty <> sbuffer.io.sqempty
// Sbuffer
sbuffer.io.csrCtrl <> csrCtrl
......@@ -617,7 +670,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
atomicsUnit.io.storeDataIn.bits := Mux1H(Seq.tabulate(exuParameters.StuCnt)(i =>
st_data_atomics(i) -> stData(i).bits))
atomicsUnit.io.rsIdx := Mux1H(Seq.tabulate(exuParameters.StuCnt)(i =>
st_atomics(i) -> io.rsfeedback(atomic_replay_port_idx(i)).rsIdx))
st_atomics(i) -> io.rsfeedback(atomic_rs(i)).rsIdx))
atomicsUnit.io.redirect <> redirect
// TODO: complete amo's pmp support
......@@ -636,7 +689,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
when (state =/= s_normal) {
// use store wb port instead of load
loadUnits(0).io.ldout.ready := false.B
loadUnits(0).io.loadOut.ready := false.B
// use load_0's TLB
atomicsUnit.io.dtlb <> amoTlb
......@@ -644,11 +697,11 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
loadUnits.map(i => i.io.prefetch_req.valid := false.B)
// make sure there's no in-flight uops in load unit
assert(!loadUnits(0).io.ldout.valid)
assert(!loadUnits(0).io.loadOut.valid)
}
for (i <- 0 until exuParameters.StuCnt) when (state === s_atomics(i)) {
atomicsUnit.io.feedbackSlow <> io.rsfeedback(atomic_replay_port_idx(i)).feedbackSlow
atomicsUnit.io.feedbackSlow <> io.rsfeedback(atomic_rs(i)).feedbackSlow
assert(!storeUnits(i).io.feedbackSlow.valid)
}
......@@ -670,9 +723,6 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
io.memInfo.lqFull := RegNext(lsq.io.lqFull)
io.memInfo.dcacheMSHRFull := RegNext(dcache.io.mshrFull)
io.lqFull := lsq.io.lqFull
io.sqFull := lsq.io.sqFull
val ldDeqCount = PopCount(io.issue.take(exuParameters.LduCnt).map(_.valid))
val stDeqCount = PopCount(io.issue.drop(exuParameters.LduCnt).map(_.valid))
val rsDeqCount = ldDeqCount + stDeqCount
......
......@@ -30,6 +30,7 @@ import xiangshan.backend.fu.fpu.FMAMidResultIO
import xiangshan.backend.issue.ReservationStationWrapper
import xiangshan.backend.regfile.{Regfile, RfReadPort}
import xiangshan.backend.rename.{BusyTable, BusyTableReadIO}
import xiangshan.backend.rob.RobPtr
import xiangshan.mem.{LsqEnqCtrl, LsqEnqIO, MemWaitUpdateReq, SqPtr}
import chisel3.ExcitingUtils
......@@ -263,16 +264,15 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
val lcommit = Input(UInt(log2Up(CommitWidth + 1).W))
val scommit = Input(UInt(log2Ceil(EnsbufferWidth + 1).W)) // connected to `memBlock.io.sqDeq` instead of ROB
// from lsq
val lqCancelCnt = Input(UInt(log2Up(LoadQueueSize + 1).W))
val lqCancelCnt = Input(UInt(log2Up(VirtualLoadQueueSize + 1).W))
val sqCancelCnt = Input(UInt(log2Up(StoreQueueSize + 1).W))
val memWaitUpdateReq = Flipped(new MemWaitUpdateReq)
// debug
val debug_int_rat = Vec(32, Input(UInt(PhyRegIdxWidth.W)))
val debug_fp_rat = Vec(32, Input(UInt(PhyRegIdxWidth.W)))
// perf
val sqFull = Input(Bool())
val lqFull = Input(Bool())
val robDeqPtr = Input(new RobPtr)
val robHeadLsIssue = Output(Bool())
}
val numFma = outer.reservationStations.map(_.module.io.fmaMid.getOrElse(Seq()).length).sum
......@@ -525,6 +525,9 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
}
}
val lsRsDeqPorts = outer.reservationStations.filter(_.params.lsqFeedback).map(_.module.io.deq).flatten
io.extra.robHeadLsIssue := lsRsDeqPorts.map(deq => deq.fire && deq.bits.uop.robIdx === io.extra.robDeqPtr).reduceOption(_ || _).getOrElse(false.B)
if ((env.AlwaysBasicDiff || env.EnableDifftest) && intRfConfig._1) {
val difftest = Module(new DifftestArchIntRegState)
difftest.io.clock := clock
......@@ -543,20 +546,6 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
XSPerfAccumulate("issue_valid", PopCount(io.issue.map(_.valid)))
XSPerfAccumulate("issue_fire", PopCount(io.issue.map(_.fire)))
if (env.EnableTopDown && rs_all.exists(_.params.isLoad)) {
val stall_ls_dq = WireDefault(0.B)
ExcitingUtils.addSink(stall_ls_dq, "stall_ls_dq", ExcitingUtils.Perf)
val ld_rs_full = !rs_all.filter(_.params.isLoad).map(_.module.io.fromDispatch.map(_.ready).reduce(_ && _)).reduce(_ && _)
val st_rs_full = !rs_all.filter(rs => rs.params.isStore || rs.params.isStoreData).map(_.module.io.fromDispatch.map(_.ready).reduce(_ && _)).reduce(_ && _)
val stall_stores_bound = stall_ls_dq && (st_rs_full || io.extra.sqFull)
val stall_loads_bound = stall_ls_dq && (ld_rs_full || io.extra.lqFull)
val stall_ls_bandwidth_bound = stall_ls_dq && !(st_rs_full || io.extra.sqFull) && !(ld_rs_full || io.extra.lqFull)
ExcitingUtils.addSource(stall_loads_bound, "stall_loads_bound", ExcitingUtils.Perf)
XSPerfAccumulate("stall_loads_bound", stall_loads_bound)
XSPerfAccumulate("stall_stores_bound", stall_stores_bound)
XSPerfAccumulate("stall_ls_bandwidth_bound", stall_ls_bandwidth_bound)
}
val lastCycleAllocate = RegNext(VecInit(allocate.map(_.fire)))
val lastCycleIssue = RegNext(VecInit(io.issue.map(_.fire)))
val schedulerPerf = Seq(
......
......@@ -37,6 +37,10 @@ class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents {
val csrCtrl = Input(new CustomCSRCtrlIO)
// perf only
val fusion = Vec(DecodeWidth - 1, Input(Bool()))
val stallReason = new Bundle {
val in = Flipped(new StallReasonIO(DecodeWidth))
val out = new StallReasonIO(DecodeWidth)
}
})
val decoders = Seq.fill(DecodeWidth)(Module(new DecodeUnit))
......@@ -72,16 +76,20 @@ class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents {
debug_globalCounter := debug_globalCounter + PopCount(io.out.map(_.fire))
io.stallReason.in.backReason := io.stallReason.out.backReason
io.stallReason.out.reason.zip(io.stallReason.in.reason).zip(io.in.map(_.valid)).foreach { case ((out, in), valid) =>
out := Mux(io.stallReason.out.backReason.valid,
io.stallReason.out.backReason.bits,
Mux(valid, TopDownCounters.NoStall.id.U, in))
}
XSPerfAccumulate("utilization", PopCount(io.in.map(_.valid)))
XSPerfAccumulate("waitInstr", PopCount((0 until DecodeWidth).map(i => io.in(i).valid && !io.in(i).ready)))
XSPerfAccumulate("stall_cycle", hasValid && !io.out(0).ready)
if (env.EnableTopDown) {
XSPerfAccumulate("slots_issued", PopCount(io.out.map(_.fire)))
XSPerfAccumulate("decode_bubbles", PopCount(io.out.map(x => !x.valid && x.ready))) // Unutilized issue-pipeline slots while there is no backend-stall
XSPerfAccumulate("fetch_bubbles", PopCount((0 until DecodeWidth).map(i => !io.in(i).valid && io.in(i).ready))) //slots
XSPerfAccumulate("ifu2id_allNO_cycle", VecInit((0 until DecodeWidth).map(i => !io.in(i).valid && io.in(i).ready)).asUInt.andR)
}
XSPerfHistogram("slots_fire", PopCount(io.out.map(_.fire)), true.B, 0, DecodeWidth+1, 1)
XSPerfHistogram("slots_valid_pure", PopCount(io.in.map(_.valid)), io.out(0).fire, 0, DecodeWidth+1, 1)
XSPerfHistogram("slots_valid_rough", PopCount(io.in.map(_.valid)), true.B, 0, DecodeWidth+1, 1)
val fusionValid = RegNext(io.fusion)
val inFire = io.in.map(in => RegNext(in.valid && !in.ready))
......
......@@ -70,6 +70,13 @@ class Dispatch(implicit p: Parameters) extends XSModule with HasPerfEvents {
val singleStep = Input(Bool())
// lfst
val lfst = new DispatchLFSTIO
// perf only
val robHead = Input(new MicroOp)
val stallReason = Flipped(new StallReasonIO(RenameWidth))
val lqCanAccept = Input(Bool())
val sqCanAccept = Input(Bool())
val robHeadNotReady = Input(Bool())
val robFull = Input(Bool())
})
/**
......@@ -105,7 +112,7 @@ class Dispatch(implicit p: Parameters) extends XSModule with HasPerfEvents {
val updatedUop = Wire(Vec(RenameWidth, new MicroOp))
val updatedCommitType = Wire(Vec(RenameWidth, CommitType()))
val checkpoint_id = RegInit(0.U(64.W))
checkpoint_id := checkpoint_id + PopCount((0 until RenameWidth).map(i =>
checkpoint_id := checkpoint_id + PopCount((0 until RenameWidth).map(i =>
io.fromRename(i).fire()
))
......@@ -151,7 +158,7 @@ class Dispatch(implicit p: Parameters) extends XSModule with HasPerfEvents {
if(i == 0){
debug_runahead_checkpoint_id := checkpoint_id
} else {
debug_runahead_checkpoint_id := checkpoint_id + PopCount((0 until i).map(i =>
debug_runahead_checkpoint_id := checkpoint_id + PopCount((0 until i).map(i =>
io.fromRename(i).fire()
))
}
......@@ -204,6 +211,7 @@ class Dispatch(implicit p: Parameters) extends XSModule with HasPerfEvents {
// (1) resources are ready
// (2) previous instructions are ready
val thisCanActualOut = (0 until RenameWidth).map(i => !thisIsBlocked(i) && notBlockedByPrevious(i))
val thisActualOut = (0 until RenameWidth).map(i => io.enqRob.req(i).valid && io.enqRob.canAccept)
val hasValidException = io.fromRename.zip(hasException).map(x => x._1.valid && x._2)
// input for ROB, LSQ, Dispatch Queue
......@@ -264,31 +272,92 @@ class Dispatch(implicit p: Parameters) extends XSModule with HasPerfEvents {
PopCount(io.toLsDq.req.map(_.valid && io.toLsDq.canAccept))
XSError(enqFireCnt > renameFireCnt, "enqFireCnt should not be greater than renameFireCnt\n")
val stall_rob = hasValidInstr && !io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept
val stall_int_dq = hasValidInstr && io.enqRob.canAccept && !io.toIntDq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept
val stall_fp_dq = hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && !io.toFpDq.canAccept && io.toLsDq.canAccept
val stall_ls_dq = hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && !io.toLsDq.canAccept
XSPerfAccumulate("in", Mux(RegNext(io.fromRename(0).ready), PopCount(io.fromRename.map(_.valid)), 0.U))
XSPerfAccumulate("empty", !hasValidInstr)
XSPerfAccumulate("utilization", PopCount(io.fromRename.map(_.valid)))
XSPerfAccumulate("waitInstr", PopCount((0 until RenameWidth).map(i => io.fromRename(i).valid && !io.recv(i))))
XSPerfAccumulate("stall_cycle_rob", hasValidInstr && !io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept)
XSPerfAccumulate("stall_cycle_int_dq", hasValidInstr && io.enqRob.canAccept && !io.toIntDq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept)
XSPerfAccumulate("stall_cycle_fp_dq", hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && !io.toFpDq.canAccept && io.toLsDq.canAccept)
XSPerfAccumulate("stall_cycle_ls_dq", hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && !io.toLsDq.canAccept)
if (env.EnableTopDown) {
val stall_ls_dq = hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && !io.toLsDq.canAccept
ExcitingUtils.addSource(stall_ls_dq, "stall_ls_dq", ExcitingUtils.Perf)
// TODO: we may need finer counters to count responding slots more precisely, i.e. per-slot granularity.
XSPerfAccumulate("stall_cycle_rob", stall_rob)
XSPerfAccumulate("stall_cycle_int_dq", stall_int_dq)
XSPerfAccumulate("stall_cycle_fp_dq", stall_fp_dq)
XSPerfAccumulate("stall_cycle_ls_dq", stall_ls_dq)
val Seq(notIssue, tlbReplay, tlbMiss, vioReplay, mshrReplay, l1Miss, l2Miss, l3Miss) =
Seq.fill(8)(WireDefault(false.B))
ExcitingUtils.addSink(notIssue, s"rob_head_ls_issue_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(tlbReplay, s"load_tlb_replay_stall_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(tlbMiss, s"load_tlb_miss_stall_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(vioReplay, s"load_vio_replay_stall_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(mshrReplay, s"load_mshr_replay_stall_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(l1Miss, s"load_l1_miss_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(l2Miss, s"L2MissMatch_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(l3Miss, s"L3MissMatch_${coreParams.HartId}", ExcitingUtils.Perf)
val ldReason = Mux(l3Miss, TopDownCounters.LoadMemStall.id.U,
Mux(l2Miss, TopDownCounters.LoadL3Stall.id.U,
Mux(l1Miss, TopDownCounters.LoadL2Stall.id.U,
Mux(notIssue, TopDownCounters.MemNotReadyStall.id.U,
Mux(tlbMiss, TopDownCounters.LoadTLBStall.id.U,
Mux(tlbReplay, TopDownCounters.LoadTLBStall.id.U,
Mux(mshrReplay, TopDownCounters.LoadMSHRReplayStall.id.U,
Mux(vioReplay, TopDownCounters.LoadVioReplayStall.id.U,
TopDownCounters.LoadL1Stall.id.U))))))))
val stallReason = Wire(chiselTypeOf(io.stallReason.reason))
val realFired = io.recv.zip(io.fromRename.map(_.valid)).map(x => x._1 && x._2)
io.stallReason.backReason.valid := !io.recv.head
io.stallReason.backReason.bits := TopDownCounters.OtherCoreStall.id.U
stallReason.zip(io.stallReason.reason).zip(io.recv).zip(realFired).map { case (((update, in), recv), fire) =>
import FuType._
val headIsInt = isIntExu(io.robHead.ctrl.fuType) && io.robHeadNotReady
val headIsFp = isFpExu(io.robHead.ctrl.fuType) && io.robHeadNotReady
val headIsDiv = isDivSqrt(io.robHead.ctrl.fuType) && io.robHeadNotReady
val headIsLd = io.robHead.ctrl.fuType === ldu && io.robHeadNotReady || !io.lqCanAccept
val headIsSt = io.robHead.ctrl.fuType === stu && io.robHeadNotReady || !io.sqCanAccept
val headIsAmo = io.robHead.ctrl.fuType === mou && io.robHeadNotReady
val headIsLs = headIsLd || headIsSt
val robLsFull = io.robFull || !io.lqCanAccept || !io.sqCanAccept
import TopDownCounters._
update := MuxCase(OtherCoreStall.id.U, Seq(
// fire
(fire ) -> NoStall.id.U ,
// dispatch not stall / core stall from rename
(in =/= OtherCoreStall.id.U ) -> in ,
// dispatch queue stall
(!io.toIntDq.canAccept && !headIsInt && !io.robFull) -> IntDqStall.id.U ,
(!io.toFpDq.canAccept && !headIsFp && !io.robFull) -> FpDqStall.id.U ,
(!io.toLsDq.canAccept && !headIsLs && !robLsFull ) -> LsDqStall.id.U ,
// rob stall
(headIsAmo ) -> AtomicStall.id.U ,
(headIsSt ) -> StoreStall.id.U ,
(headIsLd ) -> ldReason ,
(headIsDiv ) -> DivStall.id.U ,
(headIsInt ) -> IntNotReadyStall.id.U ,
(headIsFp ) -> FPNotReadyStall.id.U ,
))
}
TopDownCounters.values.foreach(ctr => XSPerfAccumulate(ctr.toString(), PopCount(stallReason.map(_ === ctr.id.U))))
XSPerfHistogram("slots_fire", PopCount(thisActualOut), true.B, 0, RenameWidth+1, 1)
// Explaination: when out(0) not fire, PopCount(valid) is not meaningfull
XSPerfHistogram("slots_valid_pure", PopCount(io.enqRob.req.map(_.valid)), thisActualOut(0), 0, RenameWidth+1, 1)
XSPerfHistogram("slots_valid_rough", PopCount(io.enqRob.req.map(_.valid)), true.B, 0, RenameWidth+1, 1)
val perfEvents = Seq(
("dispatch_in", PopCount(io.fromRename.map(_.valid & io.fromRename(0).ready)) ),
("dispatch_empty", !hasValidInstr ),
("dispatch_utili", PopCount(io.fromRename.map(_.valid)) ),
("dispatch_waitinstr", PopCount((0 until RenameWidth).map(i => io.fromRename(i).valid && !io.recv(i))) ),
("dispatch_stall_cycle_lsq", false.B ),
("dispatch_stall_cycle_rob", hasValidInstr && !io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept),
("dispatch_stall_cycle_int_dq", hasValidInstr && io.enqRob.canAccept && !io.toIntDq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept),
("dispatch_stall_cycle_fp_dq", hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && !io.toFpDq.canAccept && io.toLsDq.canAccept),
("dispatch_stall_cycle_ls_dq", hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && !io.toLsDq.canAccept)
("dispatch_in", PopCount(io.fromRename.map(_.valid & io.fromRename(0).ready)) ),
("dispatch_empty", !hasValidInstr ),
("dispatch_utili", PopCount(io.fromRename.map(_.valid)) ),
("dispatch_waitinstr", PopCount((0 until RenameWidth).map(i => io.fromRename(i).valid && !io.recv(i)))),
("dispatch_stall_cycle_lsq", false.B ),
("dispatch_stall_cycle_rob", stall_rob ),
("dispatch_stall_cycle_int_dq", stall_int_dq ),
("dispatch_stall_cycle_fp_dq", stall_fp_dq ),
("dispatch_stall_cycle_ls_dq", stall_ls_dq )
)
generatePerfEvent()
}
......@@ -98,10 +98,8 @@ class ReservationStationWrapper(implicit p: Parameters) extends LazyModule with
}
if (cfg == StaExeUnitCfg || cfg == LdExeUnitCfg) {
params.lsqFeedback = true
params.checkWaitBit = true
}
if(cfg == StaExeUnitCfg) {
params.hasFeedback = true
params.checkWaitBit = false
}
if (cfg.hasCertainLatency) {
params.fixedLatency = if (cfg == MulDivExeUnitCfg) mulCfg.latency.latencyVal.get else cfg.latency.latencyVal.get
......@@ -939,21 +937,6 @@ class ReservationStation(params: RSParams)(implicit p: Parameters) extends XSMod
}
}
if (env.EnableTopDown && params.isLoad) {
val l1d_loads_bound = WireDefault(0.B)
ExcitingUtils.addSink(l1d_loads_bound, "l1d_loads_bound", ExcitingUtils.Perf)
val mshrFull = statusArray.io.rsFeedback(RSFeedbackType.mshrFull.litValue.toInt)
val tlbMiss = !mshrFull && statusArray.io.rsFeedback(RSFeedbackType.tlbMiss.litValue.toInt)
val dataInvalid = !mshrFull && !tlbMiss && statusArray.io.rsFeedback(RSFeedbackType.dataInvalid.litValue.toInt)
val bankConflict = !mshrFull && !tlbMiss && !dataInvalid && statusArray.io.rsFeedback(RSFeedbackType.bankConflict.litValue.toInt)
val ldVioCheckRedo = !mshrFull && !tlbMiss && !dataInvalid && !bankConflict && statusArray.io.rsFeedback(RSFeedbackType.ldVioCheckRedo.litValue.toInt)
XSPerfAccumulate("l1d_loads_mshr_bound", l1d_loads_bound && mshrFull)
XSPerfAccumulate("l1d_loads_tlb_bound", l1d_loads_bound && tlbMiss)
XSPerfAccumulate("l1d_loads_store_data_bound", l1d_loads_bound && dataInvalid)
XSPerfAccumulate("l1d_loads_bank_conflict_bound", l1d_loads_bound && bankConflict)
XSPerfAccumulate("l1d_loads_vio_check_redo_bound", l1d_loads_bound && ldVioCheckRedo)
}
XSPerfAccumulate("redirect_num", io.redirect.valid)
XSPerfAccumulate("allocate_num", PopCount(s0_doEnqueue))
XSPerfHistogram("issue_num", PopCount(io.deq.map(_.valid)), true.B, 0, params.numDeq, 1)
......
......@@ -99,7 +99,7 @@ class StatusArray(params: RSParams)(implicit p: Parameters) extends XSModule
val stIssuePtr = if (params.checkWaitBit) Input(new SqPtr()) else null
val memWaitUpdateReq = if (params.checkWaitBit) Flipped(new MemWaitUpdateReq) else null
val rsFeedback = Output(Vec(5, Bool()))
val rsFeedback = Output(Vec(RSFeedbackType.allTypes, Bool()))
})
val statusArrayValid = RegInit(VecInit(Seq.fill(params.numEntries)(false.B)))
......@@ -112,7 +112,7 @@ class StatusArray(params: RSParams)(implicit p: Parameters) extends XSModule
val replayArrayNext = WireInit(replayArray)
replayArray := replayArrayNext
(statusArrayValid zip replayArrayNext).foreach { case (valid, replay) => when(valid === 0.B) { replay := RSFeedbackType.feedbackInvalid } }
io.rsFeedback := VecInit((0 until 5).map(index => statusArrayValid.zip(replayArray).map {
io.rsFeedback := VecInit((0 until RSFeedbackType.allTypes).map(index => statusArrayValid.zip(replayArray).map {
case (valid, replay) => valid && replay === index.U
}.reduce(_ || _)))
......
......@@ -48,6 +48,11 @@ class Rename(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHe
// debug arch ports
val debug_int_rat = Vec(32, Input(UInt(PhyRegIdxWidth.W)))
val debug_fp_rat = Vec(32, Input(UInt(PhyRegIdxWidth.W)))
// perf only
val stallReason = new Bundle {
val in = Flipped(new StallReasonIO(RenameWidth))
val out = new StallReasonIO(RenameWidth)
}
})
// create free list and rat
......@@ -352,6 +357,39 @@ class Rename(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHe
printRenameInfo(x, y)
}
val debugRedirect = RegEnable(io.redirect.bits, io.redirect.valid)
// bad speculation
val recStall = io.redirect.valid || io.robCommits.isWalk
val ctrlRecStall = Mux(io.redirect.valid, io.redirect.bits.debugIsCtrl, io.robCommits.isWalk && debugRedirect.debugIsCtrl)
val mvioRecStall = Mux(io.redirect.valid, io.redirect.bits.debugIsMemVio, io.robCommits.isWalk && debugRedirect.debugIsMemVio)
val otherRecStall = recStall && !(ctrlRecStall || mvioRecStall)
XSPerfAccumulate("recovery_stall", recStall)
XSPerfAccumulate("control_recovery_stall", ctrlRecStall)
XSPerfAccumulate("mem_violation_recovery_stall", mvioRecStall)
XSPerfAccumulate("other_recovery_stall", otherRecStall)
// freelist stall
val notRecStall = !io.out.head.valid && !recStall
val intFlStall = notRecStall && hasValid && !intFreeList.io.canAllocate
val fpFlStall = notRecStall && hasValid && !fpFreeList.io.canAllocate
// other stall
val otherStall = notRecStall && !intFlStall && !fpFlStall
io.stallReason.in.backReason.valid := io.stallReason.out.backReason.valid || !io.in.head.ready
io.stallReason.in.backReason.bits := Mux(io.stallReason.out.backReason.valid, io.stallReason.out.backReason.bits,
MuxCase(TopDownCounters.OtherCoreStall.id.U, Seq(
ctrlRecStall -> TopDownCounters.ControlRecoveryStall.id.U,
mvioRecStall -> TopDownCounters.MemVioRecoveryStall.id.U,
otherRecStall -> TopDownCounters.OtherRecoveryStall.id.U,
intFlStall -> TopDownCounters.IntFlStall.id.U,
fpFlStall -> TopDownCounters.FpFlStall.id.U
)
))
io.stallReason.out.reason.zip(io.stallReason.in.reason).zip(io.in.map(_.valid)).foreach { case ((out, in), valid) =>
out := Mux(io.stallReason.in.backReason.valid,
io.stallReason.in.backReason.bits,
Mux(valid, TopDownCounters.NoStall.id.U, in))
}
XSDebug(io.robCommits.isWalk, p"Walk Recovery Enabled\n")
XSDebug(io.robCommits.isWalk, p"validVec:${Binary(io.robCommits.walkValid.asUInt)}\n")
for (i <- 0 until CommitWidth) {
......@@ -370,13 +408,17 @@ class Rename(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHe
XSPerfAccumulate("stall_cycle_fp", hasValid && io.out(0).ready && !fpFreeList.io.canAllocate && intFreeList.io.canAllocate && !io.robCommits.isWalk)
XSPerfAccumulate("stall_cycle_int", hasValid && io.out(0).ready && fpFreeList.io.canAllocate && !intFreeList.io.canAllocate && !io.robCommits.isWalk)
XSPerfAccumulate("stall_cycle_walk", hasValid && io.out(0).ready && fpFreeList.io.canAllocate && intFreeList.io.canAllocate && io.robCommits.isWalk)
XSPerfAccumulate("recovery_bubbles", PopCount(io.in.map(_.valid && io.out(0).ready && fpFreeList.io.canAllocate && intFreeList.io.canAllocate && io.robCommits.isWalk)))
XSPerfHistogram("slots_fire", PopCount(io.out.map(_.fire)), true.B, 0, RenameWidth+1, 1)
// Explaination: when out(0) not fire, PopCount(valid) is not meaningfull
XSPerfHistogram("slots_valid_pure", PopCount(io.in.map(_.valid)), io.out(0).fire, 0, RenameWidth+1, 1)
XSPerfHistogram("slots_valid_rough", PopCount(io.in.map(_.valid)), true.B, 0, RenameWidth+1, 1)
XSPerfAccumulate("move_instr_count", PopCount(io.out.map(out => out.fire && out.bits.ctrl.isMove)))
val is_fused_lui_load = io.out.map(o => o.fire && o.bits.ctrl.fuType === FuType.ldu && o.bits.ctrl.srcType(0) === SrcType.imm)
XSPerfAccumulate("fused_lui_load_instr_count", PopCount(is_fused_lui_load))
val renamePerf = Seq(
("rename_in ", PopCount(io.in.map(_.valid & io.in(0).ready )) ),
("rename_waitinstr ", PopCount((0 until RenameWidth).map(i => io.in(i).valid && !io.in(i).ready)) ),
......
......@@ -26,14 +26,15 @@ import utility._
import xiangshan._
import xiangshan.backend.exu.ExuConfig
import xiangshan.frontend.FtqPtr
import xiangshan.mem.{LsqEnqIO, LqPtr}
// Memory-dependence-predictor (MDP) debug information attached to a memory op.
class DebugMdpInfo(implicit p: Parameters) extends XSBundle{
// store-set id -- presumably assigned by the store-set predictor; TODO confirm
val ssid = UInt(SSIDWidth.W)
// NOTE(review): looks like "wait for all older stores" predictor decision -- verify
val waitAllStore = Bool()
}
class DebugLsInfo(implicit p: Parameters) extends XSBundle{
val s1 = new Bundle{
class DebugLsInfo(implicit p: Parameters) extends XSBundle {
val s1 = new Bundle {
val isTlbFirstMiss = Bool() // in s1
val isBankConflict = Bool() // in s1
val isLoadToLoadForward = Bool()
......@@ -70,7 +71,7 @@ class DebugLsInfo(implicit p: Parameters) extends XSBundle{
}
}
object DebugLsInfo{
object DebugLsInfo {
def init(implicit p: Parameters): DebugLsInfo = {
val lsInfo = Wire(new DebugLsInfo)
lsInfo.s1.isTlbFirstMiss := false.B
......@@ -96,22 +97,35 @@ class DebugLSIO(implicit p: Parameters) extends XSBundle {
val debugLsInfo = Vec(exuParameters.LduCnt + exuParameters.StuCnt, Output(new DebugLsInfoBundle))
}
class DebugInstDB(implicit p: Parameters) extends XSBundle{
val globalID = UInt(XLEN.W)
val robIdx = UInt(log2Ceil(RobSize).W)
val instType = FuType()
val exceptType = ExceptionVec()
val ivaddr = UInt(VAddrBits.W)
val dvaddr = UInt(VAddrBits.W) // the l/s access address
val dpaddr = UInt(VAddrBits.W) // need the physical address when the TLB is valid
val tlbLatency = UInt(XLEN.W) // original requirements is L1toL2TlbLatency
// val levelTlbHit = UInt(2.W) // 01, 10, 11(memory)
// val otherPerfNoteThing // FIXME: how much?
val accessLatency = UInt(XLEN.W) // RS out time --> write back time
val executeLatency = UInt(XLEN.W)
val issueLatency = UInt(XLEN.W)
val lsInfo = new DebugLsInfo
val mdpInfo = new DebugMdpInfo
// Load top-down profiling info kept per ROB entry, split by load-pipeline stage:
// s1 captures the virtual address, s2 the physical address (presumably after address
// translation — TODO confirm in the load pipeline). Entries are updated sticky-style
// through the s*SignalEnable methods, reset via LsTopdownInfo.init at ROB enqueue, and
// read out at the ROB head (see sourceVaddr/sourcePaddr later in this file).
class LsTopdownInfo(implicit p: Parameters) extends XSBundle {
val s1 = new Bundle {
// ROB index of the load observed in s1; used to select which entry to update.
val robIdx = UInt(log2Ceil(RobSize).W)
// Virtual address of the access, valid once captured in s1.
val vaddr_valid = Bool()
val vaddr_bits = UInt(VAddrBits.W)
}
val s2 = new Bundle {
// ROB index of the load observed in s2.
val robIdx = UInt(log2Ceil(RobSize).W)
// Physical address of the access, valid once captured in s2.
val paddr_valid = Bool()
val paddr_bits = UInt(PAddrBits.W)
}
// Merge incoming s1 info into this entry: once `ena` reports a valid vaddr, latch the
// bits and assert the valid flag. The flag is never cleared here — it stays set until
// the entry is re-initialized at enqueue.
def s1SignalEnable(ena: LsTopdownInfo) = {
when(ena.s1.vaddr_valid) {
s1.vaddr_valid := true.B
s1.vaddr_bits := ena.s1.vaddr_bits
}
}
// Same sticky-latch update as s1SignalEnable, but for the s2 physical address.
def s2SignalEnable(ena: LsTopdownInfo) = {
when(ena.s2.paddr_valid) {
s2.paddr_valid := true.B
s2.paddr_bits := ena.s2.paddr_bits
}
}
}
// Companion object: all-zero initial value (both valid bits deasserted), used to reset
// a ROB entry's top-down info at enqueue time.
object LsTopdownInfo {
def init(implicit p: Parameters): LsTopdownInfo = 0.U.asTypeOf(new LsTopdownInfo)
}
class RobPtr(implicit p: Parameters) extends CircularQueuePtr[RobPtr](
......@@ -154,6 +168,10 @@ class RobLsqIO(implicit p: Parameters) extends XSBundle {
val pendingld = Output(Bool())
val pendingst = Output(Bool())
val commit = Output(Bool())
val pendingPtr = Output(new RobPtr)
val mmio = Input(Vec(LoadPipelineWidth, Bool()))
val uop = Input(Vec(LoadPipelineWidth, new MicroOp))
}
class RobEnqIO(implicit p: Parameters) extends XSBundle {
......@@ -165,8 +183,6 @@ class RobEnqIO(implicit p: Parameters) extends XSBundle {
val resp = Vec(RenameWidth, Output(new RobPtr))
}
class RobDispatchData(implicit p: Parameters) extends RobCommitInfo
class RobDeqPtrWrapper(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper {
val io = IO(new Bundle {
// for commits/flush
......@@ -395,9 +411,14 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
val robDeqPtr = Output(new RobPtr)
val csr = new RobCSRIO
val robFull = Output(Bool())
val headNotReady = Output(Bool())
val cpu_halt = Output(Bool())
val wfi_enable = Input(Bool())
val debug_ls = Flipped(new DebugLSIO)
val debugRobHead = Output(new MicroOp)
val debugEnqLsq = Input(new LsqEnqIO)
val debugHeadLsIssue = Input(Bool())
val lsTopdownInfo = Vec(exuParameters.LduCnt, Input(new LsTopdownInfo))
})
def selectWb(index: Int, func: Seq[ExuConfig] => Boolean): Seq[(Seq[ExuConfig], ValidIO[ExuOutput])] = {
......@@ -424,6 +445,7 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
// writeback status
val writebacked = Mem(RobSize, Bool())
val store_data_writebacked = Mem(RobSize, Bool())
val mmio = RegInit(VecInit(Seq.fill(RobSize)(false.B)))
// data for redirect, exception, etc.
val flagBkup = Mem(RobSize, Bool())
// some instructions are not allowed to trigger interrupts
......@@ -436,6 +458,9 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
val debug_exuData = Reg(Vec(RobSize, UInt(XLEN.W)))//for debug
val debug_exuDebug = Reg(Vec(RobSize, new DebugBundle))//for debug
val debug_lsInfo = RegInit(VecInit(Seq.fill(RobSize)(DebugLsInfo.init)))
val debug_lsTopdownInfo = RegInit(VecInit(Seq.fill(RobSize)(LsTopdownInfo.init)))
val debug_lqIdxValid = RegInit(VecInit.fill(RobSize)(false.B))
val debug_lsIssued = RegInit(VecInit.fill(RobSize)(false.B))
// pointers
// For enqueue ptr, we don't duplicate it since only enqueue needs it.
......@@ -452,6 +477,9 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
val isEmpty = enqPtr === deqPtr
val isReplaying = io.redirect.valid && RedirectLevel.flushItself(io.redirect.bits.level)
val debug_lsIssue = WireDefault(debug_lsIssued)
debug_lsIssue(deqPtr.value) := io.debugHeadLsIssue
/**
* states of Rob
*/
......@@ -469,7 +497,7 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
* (1) read: commits/walk/exception
* (2) write: write back from exe units
*/
val dispatchData = Module(new SyncDataModuleTemplate(new RobDispatchData, RobSize, CommitWidth, RenameWidth))
val dispatchData = Module(new SyncDataModuleTemplate(new RobCommitInfo, RobSize, CommitWidth, RenameWidth))
val dispatchDataRead = dispatchData.io.rdata
val exceptionGen = Module(new ExceptionGen)
......@@ -477,6 +505,7 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
val fflagsDataRead = Wire(Vec(CommitWidth, UInt(5.W)))
io.robDeqPtr := deqPtr
io.debugRobHead := debug_microOp(deqPtr.value)
/**
* Enqueue (from dispatch)
......@@ -528,6 +557,9 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
debug_microOp(enqIndex).debugInfo.tlbFirstReqTime := timer
debug_microOp(enqIndex).debugInfo.tlbRespTime := timer
debug_lsInfo(enqIndex) := DebugLsInfo.init
debug_lsTopdownInfo(enqIndex) := LsTopdownInfo.init
debug_lqIdxValid(enqIndex) := false.B
debug_lsIssued(enqIndex) := false.B
when (enqUop.ctrl.blockBackward) {
hasBlockBackward := true.B
}
......@@ -552,6 +584,8 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
when (enqUop.ctrl.isWFI && !enqHasException && !enqHasTriggerHit) {
hasWFI := true.B
}
mmio(enqIndex) := false.B
}
}
val dispatchNum = Mux(io.enq.canAccept, PopCount(io.enq.req.map(_.valid)), 0.U)
......@@ -561,6 +595,19 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
hasWFI := false.B
}
// lqEnq
io.debugEnqLsq.needAlloc.map(_(0)).zip(io.debugEnqLsq.req).foreach { case (alloc, req) =>
when(io.debugEnqLsq.canAccept && alloc && req.valid) {
debug_microOp(req.bits.robIdx.value).lqIdx := req.bits.lqIdx
debug_lqIdxValid(req.bits.robIdx.value) := true.B
}
}
// lsIssue
when(io.debugHeadLsIssue) {
debug_lsIssued(deqPtr.value) := true.B
}
/**
* Writeback (from execution units)
*/
......@@ -591,6 +638,11 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
val writebackNum = PopCount(exuWriteback.map(_.valid))
XSInfo(writebackNum =/= 0.U, "writebacked %d insts\n", writebackNum)
for (i <- 0 until LoadPipelineWidth) {
when (RegNext(io.lsq.mmio(i))) {
mmio(RegNext(io.lsq.uop(i).robIdx).value) := true.B
}
}
/**
* RedirectOut: Interrupt and Exceptions
......@@ -736,9 +788,10 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
io.lsq.lcommit := RegNext(Mux(io.commits.isCommit, PopCount(ldCommitVec), 0.U))
io.lsq.scommit := RegNext(Mux(io.commits.isCommit, PopCount(stCommitVec), 0.U))
// indicate a pending load or store
io.lsq.pendingld := RegNext(io.commits.isCommit && io.commits.info(0).commitType === CommitType.LOAD && valid(deqPtr.value))
io.lsq.pendingld := RegNext(io.commits.isCommit && io.commits.info(0).commitType === CommitType.LOAD && valid(deqPtr.value) && mmio(deqPtr.value))
io.lsq.pendingst := RegNext(io.commits.isCommit && io.commits.info(0).commitType === CommitType.STORE && valid(deqPtr.value))
io.lsq.commit := RegNext(io.commits.isCommit && io.commits.commitValid(0))
io.lsq.pendingPtr := RegNext(deqPtr)
/**
* state changes
......@@ -867,6 +920,10 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
debug_lsInfo(io.debug_ls.debugLsInfo(i).s1_robIdx).s1SignalEnable(io.debug_ls.debugLsInfo(i))
debug_lsInfo(io.debug_ls.debugLsInfo(i).s2_robIdx).s2SignalEnable(io.debug_ls.debugLsInfo(i))
}
for (i <- 0 until exuParameters.LduCnt) {
debug_lsTopdownInfo(io.lsTopdownInfo(i).s1.robIdx).s1SignalEnable(io.lsTopdownInfo(i))
debug_lsTopdownInfo(io.lsTopdownInfo(i).s2.robIdx).s2SignalEnable(io.lsTopdownInfo(i))
}
// status field: writebacked
// enqueue logic set 6 writebacked to false
......@@ -1007,6 +1064,7 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
instrCntReg := instrCnt
io.csr.perfinfo.retiredInstr := retireCounter
io.robFull := !allowEnqueue
io.headNotReady := commit_v.head && !commit_w.head
/**
* debug info
......@@ -1097,18 +1155,36 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
}
}
val sourceVaddr = Wire(Valid(UInt(VAddrBits.W)))
sourceVaddr.valid := debug_lsTopdownInfo(deqPtr.value).s1.vaddr_valid
sourceVaddr.bits := debug_lsTopdownInfo(deqPtr.value).s1.vaddr_bits
val sourcePaddr = Wire(Valid(UInt(PAddrBits.W)))
sourcePaddr.valid := debug_lsTopdownInfo(deqPtr.value).s2.paddr_valid
sourcePaddr.bits := debug_lsTopdownInfo(deqPtr.value).s2.paddr_bits
val sourceLqIdx = Wire(Valid(new LqPtr))
sourceLqIdx.valid := debug_lqIdxValid(deqPtr.value)
sourceLqIdx.bits := debug_microOp(deqPtr.value).lqIdx
val sourceHeadLsIssue = WireDefault(debug_lsIssue(deqPtr.value))
ExcitingUtils.addSource(sourceVaddr, s"rob_head_vaddr_${coreParams.HartId}", ExcitingUtils.Perf, true)
ExcitingUtils.addSource(sourcePaddr, s"rob_head_paddr_${coreParams.HartId}", ExcitingUtils.Perf, true)
ExcitingUtils.addSource(sourceLqIdx, s"rob_head_lqIdx_${coreParams.HartId}", ExcitingUtils.Perf, true)
ExcitingUtils.addSource(sourceHeadLsIssue, s"rob_head_ls_issue_${coreParams.HartId}", ExcitingUtils.Perf, true)
// dummy sink
ExcitingUtils.addSink(WireDefault(sourceLqIdx), s"rob_head_lqIdx_${coreParams.HartId}", ExcitingUtils.Perf)
/**
* DataBase info:
* log trigger is at writeback valid
* */
if(!env.FPGAPlatform){
val instTableName = "InstDB" + p(XSCoreParamsKey).HartId.toString
val isWriteInstInfoTable = WireInit(Constantin.createRecord("isWriteInstInfoTable" + p(XSCoreParamsKey).HartId.toString))
val instTableName = "InstTable" + p(XSCoreParamsKey).HartId.toString
val instSiteName = "Rob" + p(XSCoreParamsKey).HartId.toString
val debug_instTable = ChiselDB.createTable(instTableName, new DebugInstDB)
val debug_instTable = ChiselDB.createTable(instTableName, new InstInfoEntry)
// FIXME lyq: only get inst (alu, bj, ls) in exuWriteback
for (wb <- exuWriteback) {
when(wb.valid) {
val debug_instData = Wire(new DebugInstDB)
val debug_instData = Wire(new InstInfoEntry)
val idx = wb.bits.uop.robIdx.value
debug_instData.globalID := wb.bits.uop.ctrl.debug_globalID
debug_instData.robIdx := idx
......@@ -1120,10 +1196,12 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
debug_instData.accessLatency := wb.bits.uop.debugInfo.writebackTime - wb.bits.uop.debugInfo.issueTime
debug_instData.executeLatency := wb.bits.uop.debugInfo.writebackTime - wb.bits.uop.debugInfo.issueTime
debug_instData.issueLatency := wb.bits.uop.debugInfo.issueTime - wb.bits.uop.debugInfo.selectTime
debug_instData.exceptType := wb.bits.uop.cf.exceptionVec
debug_instData.exceptType := Cat(wb.bits.uop.cf.exceptionVec)
debug_instData.lsInfo := debug_lsInfo(idx)
debug_instData.mdpInfo.ssid := wb.bits.uop.cf.ssid
debug_instData.mdpInfo.waitAllStore := wb.bits.uop.cf.loadWaitStrict && wb.bits.uop.cf.loadWaitBit
debug_instData.issueTime := wb.bits.uop.debugInfo.issueTime
debug_instData.writebackTime := wb.bits.uop.debugInfo.writebackTime
debug_instTable.log(
data = debug_instData,
en = wb.valid,
......
......@@ -27,7 +27,8 @@ import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp, Trans
import freechips.rocketchip.tilelink._
import freechips.rocketchip.util.{BundleFieldBase, UIntToOH1}
import device.RAMHelper
import huancun.{AliasField, AliasKey, DirtyField, PreferCacheField, PrefetchField}
import coupledL2.{AliasField, AliasKey, DirtyField, PrefetchField}
import utility.ReqSourceField
import utility.FastArbiter
import mem.{AddPipelineReg}
import xiangshan.cache.dcache.ReplayCarry
......@@ -50,7 +51,7 @@ case class DCacheParameters
nMMIOEntries: Int = 1,
nMMIOs: Int = 1,
blockBytes: Int = 64,
alwaysReleaseData: Boolean = true
alwaysReleaseData: Boolean = false
) extends L1CacheParameters {
// if sets * blockBytes > 4KB(page size),
// cache alias will happen,
......@@ -59,9 +60,9 @@ case class DCacheParameters
val aliasBitsOpt = if(setBytes > pageSize) Some(log2Ceil(setBytes / pageSize)) else None
val reqFields: Seq[BundleFieldBase] = Seq(
PrefetchField(),
PreferCacheField()
ReqSourceField()
) ++ aliasBitsOpt.map(AliasField)
val echoFields: Seq[BundleFieldBase] = Seq(DirtyField())
val echoFields: Seq[BundleFieldBase] = Nil
def tagCode: Code = Code.fromString(tagECC)
......@@ -152,7 +153,7 @@ trait HasDCacheParameters extends HasL1CacheParameters {
val DCacheLineOffset = DCacheSetOffset
// uncache
val uncacheIdxBits = log2Up(StoreQueueSize) max log2Up(LoadQueueSize)
val uncacheIdxBits = log2Up(StoreQueueSize + 1) max log2Up(VirtualLoadQueueSize + 1)
// hardware prefetch parameters
// high confidence hardware prefetch port
val HighConfHWPFLoadPort = LoadPipelineWidth - 1 // use the last load port by default
......@@ -299,18 +300,21 @@ class DCacheExtraMeta(implicit p: Parameters) extends DCacheBundle
}
// memory request in word granularity(load, mmio, lr/sc, atomics)
class DCacheWordReq(implicit p: Parameters) extends DCacheBundle
class DCacheWordReq(implicit p: Parameters) extends DCacheBundle
{
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val vaddr = UInt(VAddrBits.W)
val data = UInt(DataBits.W)
val mask = UInt((DataBits/8).W)
val id = UInt(reqIdWidth.W)
val instrtype = UInt(sourceTypeWidth.W)
val isFirstIssue = Bool()
val replayCarry = new ReplayCarry
val debug_robIdx = UInt(log2Ceil(RobSize).W)
def dump() = {
XSDebug("DCacheWordReq: cmd: %x addr: %x data: %x mask: %x id: %d\n",
cmd, addr, data, mask, id)
XSDebug("DCacheWordReq: cmd: %x vaddr: %x data: %x mask: %x id: %d\n",
cmd, vaddr, data, mask, id)
}
}
......@@ -331,7 +335,7 @@ class DCacheLineReq(implicit p: Parameters) extends DCacheBundle
}
class DCacheWordReqWithVaddr(implicit p: Parameters) extends DCacheWordReq {
val vaddr = UInt(VAddrBits.W)
val addr = UInt(PAddrBits.W)
val wline = Bool()
}
......@@ -342,7 +346,6 @@ class BaseDCacheWordResp(implicit p: Parameters) extends DCacheBundle
// select in s3
val data_delayed = UInt(DataBits.W)
val id = UInt(reqIdWidth.W)
// cache req missed, send it to miss queue
val miss = Bool()
// cache miss, and failed to enter the missqueue, replay from RS is needed
......@@ -352,6 +355,7 @@ class BaseDCacheWordResp(implicit p: Parameters) extends DCacheBundle
val tag_error = Bool() // tag error
val mshr_id = UInt(log2Up(cfg.nMissEntries).W)
val debug_robIdx = UInt(log2Ceil(RobSize).W)
def dump() = {
XSDebug("DCacheWordResp: data: %x id: %d miss: %b replay: %b\n",
data, id, miss, replay)
......@@ -362,8 +366,11 @@ class DCacheWordResp(implicit p: Parameters) extends BaseDCacheWordResp
{
val meta_prefetch = Bool()
val meta_access = Bool()
// 1 cycle after data resp
// s2
val handled = Bool()
// s3: 1 cycle after data resp
val error_delayed = Bool() // all kinds of errors, include tag error
val replacementUpdated = Bool()
}
class BankedDCacheWordResp(implicit p: Parameters) extends DCacheWordResp
......@@ -430,6 +437,7 @@ class UncacheWordReq(implicit p: Parameters) extends DCacheBundle
val id = UInt(uncacheIdxBits.W)
val instrtype = UInt(sourceTypeWidth.W)
val atomic = Bool()
val isFirstIssue = Bool()
val replayCarry = new ReplayCarry
def dump() = {
......@@ -450,6 +458,7 @@ class UncacheWorResp(implicit p: Parameters) extends DCacheBundle
val replayCarry = new ReplayCarry
val mshr_id = UInt(log2Up(cfg.nMissEntries).W) // FIXME: why uncacheWordResp is not merged to baseDcacheResp
val debug_robIdx = UInt(log2Ceil(RobSize).W)
def dump() = {
XSDebug("UncacheWordResp: data: %x id: %d miss: %b replay: %b, tag_error: %b, error: %b\n",
data, id, miss, replay, tag_error, error)
......@@ -488,14 +497,17 @@ class DCacheLoadIO(implicit p: Parameters) extends DCacheWordIO
val s1_kill = Output(Bool())
val s2_kill = Output(Bool())
val s2_pc = Output(UInt(VAddrBits.W))
// cycle 0: load has updated replacement before
val replacementUpdated = Output(Bool())
// cycle 0: virtual address: req.addr
// cycle 1: physical address: s1_paddr
val s1_paddr_dup_lsu = Output(UInt(PAddrBits.W)) // lsu side paddr
val s1_paddr_dup_dcache = Output(UInt(PAddrBits.W)) // dcache side paddr
val s1_disable_fast_wakeup = Input(Bool())
val s1_bank_conflict = Input(Bool())
// cycle 2: hit signal
val s2_hit = Input(Bool()) // hit signal for lsu,
val s2_hit = Input(Bool()) // hit signal for lsu,
val s2_first_hit = Input(Bool())
val s2_bank_conflict = Input(Bool())
// debug
val debug_s1_hit_way = Input(UInt(nWays.W))
......@@ -691,7 +703,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
//----------------------------------------
// core data structures
val bankedDataArray = Module(new BankedDataArray)
val bankedDataArray = if(EnableDCacheWPU) Module(new SramedDataArray) else Module(new BankedDataArray)
val metaArray = Module(new L1CohMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2))
val errorArray = Module(new L1FlagMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2))
val prefetchArray = Module(new L1FlagMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2)) // prefetch flag array
......@@ -791,6 +803,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
//----------------------------------------
// data array
mainPipe.io.data_read.zip(ldu).map(x => x._1 := x._2.io.lsu.req.valid)
val dataWriteArb = Module(new Arbiter(new L1BankedDataWriteReq, 2))
dataWriteArb.io.in(0) <> refillPipe.io.data_write
......@@ -808,7 +821,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
bankedDataArray.io.write_dup(bank) <> dataWriteArb_dup.io.out
}
bankedDataArray.io.readline <> mainPipe.io.data_read
bankedDataArray.io.readline <> mainPipe.io.data_readline
bankedDataArray.io.readline_intend := mainPipe.io.data_read_intend
mainPipe.io.readline_error_delayed := bankedDataArray.io.readline_error_delayed
mainPipe.io.data_resp := bankedDataArray.io.readline_resp
......@@ -847,6 +860,35 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
bankedDataArray.io.disable_ld_fast_wakeup(w) // load pipe fast wake up should be disabled when bank conflict
}
/** LoadMissDB: record load miss state */
val isWriteLoadMissTable = WireInit(Constantin.createRecord("isWriteLoadMissTable" + p(XSCoreParamsKey).HartId.toString))
val isFirstHitWrite = WireInit(Constantin.createRecord("isFirstHitWrite" + p(XSCoreParamsKey).HartId.toString))
val tableName = "LoadMissDB" + p(XSCoreParamsKey).HartId.toString
val siteName = "DcacheWrapper" + p(XSCoreParamsKey).HartId.toString
val loadMissTable = ChiselDB.createTable(tableName, new LoadMissEntry)
for( i <- 0 until LoadPipelineWidth){
val loadMissEntry = Wire(new LoadMissEntry)
val loadMissWriteEn =
(!ldu(i).io.lsu.resp.bits.replay && ldu(i).io.miss_req.fire) ||
(ldu(i).io.lsu.s2_first_hit && ldu(i).io.lsu.resp.valid && isFirstHitWrite.orR)
loadMissEntry.timeCnt := GTimer()
loadMissEntry.robIdx := ldu(i).io.lsu.resp.bits.debug_robIdx
loadMissEntry.paddr := ldu(i).io.miss_req.bits.addr
loadMissEntry.vaddr := ldu(i).io.miss_req.bits.vaddr
loadMissEntry.missState := OHToUInt(Cat(Seq(
ldu(i).io.miss_req.fire & ldu(i).io.miss_resp.merged,
ldu(i).io.miss_req.fire & !ldu(i).io.miss_resp.merged,
ldu(i).io.lsu.s2_first_hit && ldu(i).io.lsu.resp.valid
)))
loadMissTable.log(
data = loadMissEntry,
en = isWriteLoadMissTable.orR && loadMissWriteEn,
site = siteName,
clock = clock,
reset = reset
)
}
//----------------------------------------
// atomics
// atomics not finished yet
......@@ -1104,7 +1146,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
ld_access.zip(ldu).foreach {
case (a, u) =>
a.valid := RegNext(u.io.lsu.req.fire()) && !u.io.lsu.s1_kill
a.bits.idx := RegNext(get_idx(u.io.lsu.req.bits.addr))
a.bits.idx := RegNext(get_idx(u.io.lsu.req.bits.vaddr))
a.bits.tag := get_tag(u.io.lsu.s1_paddr_dup_dcache)
}
st_access.valid := RegNext(mainPipe.io.store_req.fire())
......
......@@ -21,6 +21,7 @@ import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink.ClientMetadata
import utils.{HasPerfEvents, XSDebug, XSPerfAccumulate}
import utility.ParallelPriorityMux
import xiangshan.L1CacheErrorInfo
import xiangshan.cache.dcache.{DCacheWPU, IdealWPU}
......@@ -87,11 +88,11 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val tag_read = io.tag_read.bits
// Tag read for new requests
meta_read.idx := get_idx(io.lsu.req.bits.addr)
meta_read.idx := get_idx(io.lsu.req.bits.vaddr)
meta_read.way_en := ~0.U(nWays.W)
// meta_read.tag := DontCare
tag_read.idx := get_idx(io.lsu.req.bits.addr)
tag_read.idx := get_idx(io.lsu.req.bits.vaddr)
tag_read.way_en := ~0.U(nWays.W)
// Pipeline
......@@ -103,7 +104,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val s0_valid = io.lsu.req.fire()
val s0_req = io.lsu.req.bits
val s0_fire = s0_valid && s1_ready
val s0_vaddr = s0_req.addr
val s0_vaddr = s0_req.vaddr
val s0_replayCarry = s0_req.replayCarry
assert(RegNext(!(s0_valid && (s0_req.cmd =/= MemoryOpConstants.M_XRD && s0_req.cmd =/= MemoryOpConstants.M_PFR && s0_req.cmd =/= MemoryOpConstants.M_PFW))), "LoadPipe only accepts load req / softprefetch read or write!")
dump_pipeline_reqs("LoadPipe s0", s0_valid, s0_req)
......@@ -119,7 +120,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val s1_paddr_dup_lsu = io.lsu.s1_paddr_dup_lsu
val s1_paddr_dup_dcache = io.lsu.s1_paddr_dup_dcache
// LSU may update the address from io.lsu.s1_paddr, which affects the bank read enable only.
val s1_vaddr = Cat(s1_req.addr(PAddrBits - 1, blockOffBits), io.lsu.s1_paddr_dup_lsu(blockOffBits - 1, 0))
val s1_vaddr = Cat(s1_req.vaddr(VAddrBits - 1, blockOffBits), io.lsu.s1_paddr_dup_lsu(blockOffBits - 1, 0))
val s1_bank_oh = UIntToOH(addr_to_dcache_bank(s1_vaddr))
val s1_nack = RegNext(io.nack)
val s1_nack_data = !io.banked_data_read.ready
......@@ -195,16 +196,24 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
io.replace_way.set.valid := RegNext(s0_fire)
io.replace_way.set.bits := get_idx(s1_vaddr)
val s1_repl_way_en = UIntToOH(io.replace_way.way)
val s1_repl_tag = Mux1H(s1_repl_way_en, wayMap(w => tag_resp(w)))
val s1_repl_coh = Mux1H(s1_repl_way_en, wayMap(w => meta_resp(w).coh))
val s1_repl_extra_meta = Mux1H(s1_repl_way_en, wayMap(w => io.extra_meta_resp(w)))
val s1_invalid_vec = wayMap(w => !meta_resp(w).coh.isValid())
val s1_have_invalid_way = s1_invalid_vec.asUInt.orR
val s1_invalid_way_en = ParallelPriorityMux(s1_invalid_vec.zipWithIndex.map(x => x._1 -> UIntToOH(x._2.U(nWays.W))))
val s1_repl_way_en_oh = Mux(s1_have_invalid_way, s1_invalid_way_en, UIntToOH(io.replace_way.way))
val s1_repl_way_en_enc = OHToUInt(s1_repl_way_en_oh)
val s1_repl_tag = Mux1H(s1_repl_way_en_oh, wayMap(w => tag_resp(w)))
val s1_repl_coh = Mux1H(s1_repl_way_en_oh, wayMap(w => meta_resp(w).coh))
val s1_repl_extra_meta = Mux1H(s1_repl_way_en_oh, wayMap(w => io.extra_meta_resp(w)))
val s1_need_replacement = !s1_tag_match_dup_dc
val s1_way_en = Mux(s1_need_replacement, s1_repl_way_en, s1_tag_match_way_dup_dc)
val s1_way_en = Mux(s1_need_replacement, s1_repl_way_en_oh, s1_tag_match_way_dup_dc)
val s1_coh = Mux(s1_need_replacement, s1_repl_coh, s1_hit_coh)
val s1_tag = Mux(s1_need_replacement, s1_repl_tag, get_tag(s1_paddr_dup_dcache))
XSPerfAccumulate("load_has_invalid_way_but_select_valid_way", io.replace_way.set.valid && wayMap(w => !meta_resp(w).coh.isValid()).asUInt.orR && s1_need_replacement && s1_repl_coh.isValid())
XSPerfAccumulate("load_using_replacement", io.replace_way.set.valid && s1_need_replacement)
// data read
io.banked_data_read.valid := s1_fire && !s1_nack
io.banked_data_read.bits.addr := s1_vaddr
......@@ -272,7 +281,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val s2_nack_data = RegEnable(!io.banked_data_read.ready, s1_fire)
val s2_nack = s2_nack_hit || s2_nack_no_mshr || s2_nack_data
// s2 miss merged
val s2_miss_merged = io.miss_req.valid && io.miss_resp.merged
val s2_miss_merged = io.miss_req.fire && !io.miss_req.bits.cancel && io.miss_resp.merged
val s2_bank_addr = addr_to_dcache_bank(s2_paddr)
dontTouch(s2_bank_addr)
......@@ -329,6 +338,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
}
// io.debug_s2_cache_miss := real_miss
resp.bits.miss := real_miss || io.bank_conflict_slow || s2_wpu_pred_fail
io.lsu.s2_first_hit := s2_req.isFirstIssue && s2_hit
// load pipe need replay when there is a bank conflict or wpu predict fail
resp.bits.replay := (resp.bits.miss && (!io.miss_req.fire() || s2_nack)) || io.bank_conflict_slow || s2_wpu_pred_fail
resp.bits.replayCarry.valid := resp.bits.miss
......@@ -337,6 +347,8 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
resp.bits.meta_access := s2_hit_access
resp.bits.tag_error := s2_tag_error // report tag_error in load s2
resp.bits.mshr_id := io.miss_resp.id
resp.bits.handled := io.miss_req.fire && !io.miss_req.bits.cancel && io.miss_resp.handled
resp.bits.debug_robIdx := s2_req.debug_robIdx
XSPerfAccumulate("wpu_pred_fail", s2_wpu_pred_fail && s2_valid)
XSPerfAccumulate("dcache_read_bank_conflict", io.bank_conflict_slow && s2_valid)
......@@ -353,7 +365,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
io.lsu.debug_s1_hit_way := s1_tag_match_way_dup_dc
io.lsu.s1_disable_fast_wakeup := io.disable_ld_fast_wakeup
io.lsu.s1_bank_conflict := io.bank_conflict_fast
io.lsu.s2_bank_conflict := io.bank_conflict_slow
assert(RegNext(s1_ready && s2_ready), "load pipeline should never be blocked")
// --------------------------------------------------------------------------------
......@@ -376,6 +388,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
// error_delayed signal will be used to update uop.exception 1 cycle after load writeback
resp.bits.error_delayed := s3_error && (s3_hit || s3_tag_error) && s3_valid
resp.bits.data_delayed := s3_banked_data_resp_word
resp.bits.replacementUpdated := io.replace_access.valid
// report tag / data / l2 error (with paddr) to bus error unit
io.error := 0.U.asTypeOf(new L1CacheErrorInfo())
......@@ -389,30 +402,39 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
io.error.valid := s3_error && s3_valid
// update plru in s3
val s3_miss_merged = RegNext(s2_miss_merged)
val first_update = RegNext(RegNext(RegNext(!io.lsu.replacementUpdated)))
val hit_update_replace_en = RegNext(s2_valid) && RegNext(!resp.bits.miss)
val miss_update_replace_en = RegNext(io.miss_req.fire) && RegNext(!io.miss_req.bits.cancel) && RegNext(io.miss_resp.handled)
if (!cfg.updateReplaceOn2ndmiss) {
// replacement is only updated on 1st miss
io.replace_access.valid := RegNext(RegNext(
RegNext(io.meta_read.fire()) && s1_valid && !io.lsu.s1_kill) &&
!s2_nack_no_mshr &&
!s2_miss_merged
)
io.replace_access.bits.set := RegNext(RegNext(get_idx(s1_req.addr)))
io.replace_access.bits.way := RegNext(RegNext(Mux(s1_tag_match_dup_dc, OHToUInt(s1_tag_match_way_dup_dc), io.replace_way.way)))
// io.replace_access.valid := RegNext(RegNext(
// RegNext(io.meta_read.fire()) && s1_valid && !io.lsu.s1_kill) &&
// !s2_nack_no_mshr &&
// !s2_miss_merged
// )
io.replace_access.valid := (hit_update_replace_en || (miss_update_replace_en && !s3_miss_merged)) && first_update
io.replace_access.bits.set := RegNext(RegNext(get_idx(s1_req.vaddr)))
io.replace_access.bits.way := RegNext(RegNext(Mux(s1_tag_match_dup_dc, OHToUInt(s1_tag_match_way_dup_dc), s1_repl_way_en_enc)))
} else {
// replacement is updated on both 1st and 2nd miss
// timing is worse than !cfg.updateReplaceOn2ndmiss
io.replace_access.valid := RegNext(RegNext(
RegNext(io.meta_read.fire()) && s1_valid && !io.lsu.s1_kill) &&
!s2_nack_no_mshr
)
io.replace_access.bits.set := RegNext(RegNext(get_idx(s1_req.addr)))
// io.replace_access.valid := RegNext(RegNext(
// RegNext(io.meta_read.fire()) && s1_valid && !io.lsu.s1_kill) &&
// !s2_nack_no_mshr &&
// // replacement is updated on 2nd miss only when this req is firstly issued
// (!s2_miss_merged || s2_req.isFirstIssue)
// )
io.replace_access.valid := (hit_update_replace_en || miss_update_replace_en) && first_update
io.replace_access.bits.set := RegNext(RegNext(get_idx(s1_req.vaddr)))
io.replace_access.bits.way := RegNext(
Mux(
RegNext(s1_tag_match_dup_dc),
RegNext(OHToUInt(s1_tag_match_way_dup_dc)), // if hit, access hit way in plru
Mux( // if miss
!s2_miss_merged,
RegNext(io.replace_way.way), // 1st fire: access new selected replace way
RegNext(s1_repl_way_en_enc), // 1st fire: access new selected replace way
OHToUInt(io.miss_resp.repl_way_en) // 2nd fire: access replace way selected at miss queue allocate time
)
)
......
......@@ -25,7 +25,7 @@ import freechips.rocketchip.tilelink.TLPermissions._
import freechips.rocketchip.tilelink.{ClientMetadata, ClientStates, TLPermissions}
import utils._
import utility._
import xiangshan.L1CacheErrorInfo
import xiangshan.{L1CacheErrorInfo, XSCoreParamsKey}
class MainPipeReq(implicit p: Parameters) extends DCacheBundle {
val miss = Bool() // only amo miss will refill in main pipe
......@@ -121,8 +121,9 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
val probe_ttob_check_resp = Flipped(ValidIO(new ProbeToBCheckResp))
// data sram
val data_read = Vec(LoadPipelineWidth, Input(Bool()))
val data_read_intend = Output(Bool())
val data_read = DecoupledIO(new L1BankedDataReadLineReq)
val data_readline = DecoupledIO(new L1BankedDataReadLineReq)
val data_resp = Input(Vec(DCacheBanks, new L1BankedDataReadResult()))
val readline_error_delayed = Input(Bool())
val data_write = DecoupledIO(new L1BankedDataWriteReq)
......@@ -181,10 +182,22 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
val s1_ready, s2_ready, s3_ready = Wire(Bool())
// convert store req to main pipe req, and select a req from store and probe
val storeWaitCycles = RegInit(0.U(4.W))
val StoreWaitThreshold = WireInit(12.U(4.W))
val storeWaitTooLong = storeWaitCycles >= StoreWaitThreshold
val loadsAreComing = io.data_read.asUInt.orR
val storeCanAccept = storeWaitTooLong || !loadsAreComing
val store_req = Wire(DecoupledIO(new MainPipeReq))
store_req.bits := (new MainPipeReq).convertStoreReq(io.store_req.bits)
store_req.valid := io.store_req.valid
io.store_req.ready := store_req.ready
store_req.valid := io.store_req.valid && storeCanAccept
io.store_req.ready := store_req.ready && storeCanAccept
when (store_req.fire) { // if wait too long and write success, reset counter.
storeWaitCycles := 0.U
} .elsewhen (storeWaitCycles < StoreWaitThreshold && store_req.valid && !store_req.ready) { // if block store, increase counter.
storeWaitCycles := storeWaitCycles + 1.U
}
// s0: read meta and tag
val req = Wire(DecoupledIO(new MainPipeReq))
......@@ -244,7 +257,7 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
val s1_banked_rmask = RegEnable(s0_banked_rmask, s0_fire)
val s1_banked_store_wmask = RegEnable(banked_store_wmask, s0_fire)
val s1_need_tag = RegEnable(s0_need_tag, s0_fire)
val s1_can_go = s2_ready && (io.data_read.ready || !s1_need_data)
val s1_can_go = s2_ready && (io.data_readline.ready || !s1_need_data)
val s1_fire = s1_valid && s1_can_go
val s1_idx = get_idx(s1_req.vaddr)
......@@ -293,8 +306,19 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
XSPerfAccumulate("replace_unused_prefetch", s1_req.replace && s1_extra_meta.prefetch && !s1_extra_meta.access) // may not be accurate
// replacement policy
val s1_invalid_vec = wayMap(w => !meta_resp(w).asTypeOf(new Meta).coh.isValid())
val s1_have_invalid_way = s1_invalid_vec.asUInt.orR
val s1_invalid_way_en = ParallelPriorityMux(s1_invalid_vec.zipWithIndex.map(x => x._1 -> UIntToOH(x._2.U(nWays.W))))
val s1_repl_way_en = WireInit(0.U(nWays.W))
s1_repl_way_en := Mux(RegNext(s0_fire), UIntToOH(io.replace_way.way), RegNext(s1_repl_way_en))
s1_repl_way_en := Mux(
RegNext(s0_fire),
Mux(
s1_have_invalid_way,
s1_invalid_way_en,
UIntToOH(io.replace_way.way)
),
RegNext(s1_repl_way_en)
)
val s1_repl_tag = Mux1H(s1_repl_way_en, wayMap(w => tag_resp(w)))
val s1_repl_coh = Mux1H(s1_repl_way_en, wayMap(w => meta_resp(w))).asTypeOf(new ClientMetadata)
val s1_miss_tag = Mux1H(s1_req.miss_way_en, wayMap(w => tag_resp(w)))
......@@ -337,6 +361,9 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
)
)
XSPerfAccumulate("store_has_invalid_way_but_select_valid_way", io.replace_way.set.valid && wayMap(w => !meta_resp(w).asTypeOf(new Meta).coh.isValid()).asUInt.orR && s1_need_replacement && s1_repl_coh.isValid())
XSPerfAccumulate("store_using_replacement", io.replace_way.set.valid && s1_need_replacement)
val s1_has_permission = s1_hit_coh.onAccess(s1_req.cmd)._1
val s1_hit = s1_tag_match && s1_has_permission
val s1_pregen_can_go_to_mq = !s1_req.replace && !s1_req.probe && !s1_req.miss && (s1_req.isStore || s1_req.isAMO) && !s1_hit
......@@ -1396,10 +1423,10 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
io.tag_read.bits.way_en := ~0.U(nWays.W)
io.data_read_intend := s1_valid_dup(3) && s1_need_data
io.data_read.valid := s1_valid_dup(4) && s1_need_data
io.data_read.bits.rmask := s1_banked_rmask
io.data_read.bits.way_en := s1_way_en
io.data_read.bits.addr := s1_req_vaddr_dup_for_data_read
io.data_readline.valid := s1_valid_dup(4) && s1_need_data
io.data_readline.bits.rmask := s1_banked_rmask
io.data_readline.bits.way_en := s1_way_en
io.data_readline.bits.addr := s1_req_vaddr_dup_for_data_read
io.miss_req.valid := s2_valid_dup(4) && s2_can_go_to_mq_dup(0)
val miss_req = io.miss_req.bits
......
......@@ -27,8 +27,7 @@ import freechips.rocketchip.tilelink.ClientStates._
import freechips.rocketchip.tilelink.MemoryOpCategories._
import freechips.rocketchip.tilelink.TLPermissions._
import difftest._
import huancun.{AliasKey, DirtyKey, PreferCacheKey, PrefetchKey}
import utility.FastArbiter
import coupledL2.{AliasKey, DirtyKey, PrefetchKey}
import mem.{AddPipelineReg}
import mem.trace._
......@@ -116,6 +115,8 @@ class MissReq(implicit p: Parameters) extends MissReqWoStoreData {
class MissResp(implicit p: Parameters) extends DCacheBundle {
val id = UInt(log2Up(cfg.nMissEntries).W)
// cache miss request is handled by miss queue, either merged or newly allocated
val handled = Bool()
// cache req missed, merged into one of miss queue entries
// i.e. !miss_merged means this access is the first miss for this cacheline
val merged = Bool()
......@@ -251,11 +252,34 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
// whether the pipeline reg has send out an acquire
val acquire_fired_by_pipe_reg = Input(Bool())
val perf_pending_prefetch = Output(Bool())
val perf_pending_normal = Output(Bool())
val rob_head_query = new DCacheBundle {
val vaddr = Input(UInt(VAddrBits.W))
val query_valid = Input(Bool())
val resp = Output(Bool())
// Cacheline-granular compare against the queried vaddr: both addresses are
// truncated to the block address (offset bits below DCacheLineOffset dropped).
// Only meaningful while query_valid is asserted.
def hit(e_vaddr: UInt): Bool = {
  require(e_vaddr.getWidth == VAddrBits)
  val line_match = vaddr(VAddrBits - 1, DCacheLineOffset) === e_vaddr(VAddrBits - 1, DCacheLineOffset)
  line_match && query_valid
}
}
val latency_monitor = new DCacheBundle {
val load_miss_refilling = Output(Bool())
val store_miss_refilling = Output(Bool())
val amo_miss_refilling = Output(Bool())
val pf_miss_refilling = Output(Bool())
}
})
assert(!RegNext(io.primary_valid && !io.primary_ready))
val req = Reg(new MissReqWoStoreData)
val req_primary_fire = Reg(new MissReqWoStoreData) // for perf use
val req_store_mask = Reg(UInt(cfg.blockBytes.W))
val req_valid = RegInit(false.B)
val set = addr_to_dcache_set(req.vaddr)
......@@ -308,6 +332,14 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
val req_handled_by_this_entry = primary_fire || secondary_fire
// for perf use
val secondary_fired = RegInit(false.B)
io.perf_pending_prefetch := req_valid && prefetch && !secondary_fired
io.perf_pending_normal := req_valid && (!prefetch || secondary_fired)
io.rob_head_query.resp := io.rob_head_query.hit(req.vaddr) && req_valid
io.req_handled_by_this_entry := req_handled_by_this_entry
when (release_entry && req_valid) {
......@@ -317,7 +349,9 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
when (io.miss_req_pipe_reg.alloc) {
assert(RegNext(primary_fire), "after 1 cycle of primary_fire, entry will be allocated")
req_valid := true.B
req := miss_req_pipe_reg_bits.toMissReqWoStoreData()
req_primary_fire := miss_req_pipe_reg_bits.toMissReqWoStoreData()
req.addr := get_block_addr(miss_req_pipe_reg_bits.addr)
s_acquire := io.acquire_fired_by_pipe_reg
......@@ -353,6 +387,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
error := false.B
prefetch := input_req_is_prefetch
access := false.B
secondary_fired := false.B
}
when (io.miss_req_pipe_reg.merge) {
......@@ -382,6 +417,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
when (!input_req_is_prefetch) {
access := true.B // when merge non-prefetch req, set access bit
}
secondary_fired := true.B
}
when (io.mem_acquire.fire()) {
......@@ -544,8 +580,20 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
io.mem_acquire.bits.user.lift(AliasKey).foreach( _ := req.vaddr(13, 12))
// trigger prefetch
io.mem_acquire.bits.user.lift(PrefetchKey).foreach(_ := Mux(io.l2_pf_store_only, req.isFromStore, true.B))
// prefer not to cache data in L2 by default
io.mem_acquire.bits.user.lift(PreferCacheKey).foreach(_ := false.B)
// req source
when(prefetch && !secondary_fired) {
io.mem_acquire.bits.user.lift(ReqSourceKey).foreach(_ := MemReqSource.L1DataPrefetch.id.U)
}.otherwise {
when(req.isFromStore) {
io.mem_acquire.bits.user.lift(ReqSourceKey).foreach(_ := MemReqSource.CPUStoreData.id.U)
}.elsewhen(req.isFromLoad) {
io.mem_acquire.bits.user.lift(ReqSourceKey).foreach(_ := MemReqSource.CPULoadData.id.U)
}.elsewhen(req.isFromAMO) {
io.mem_acquire.bits.user.lift(ReqSourceKey).foreach(_ := MemReqSource.CPUAtomicData.id.U)
}.otherwise {
io.mem_acquire.bits.user.lift(ReqSourceKey).foreach(_ := MemReqSource.L1DataPrefetch.id.U)
}
}
require(nSets <= 256)
io.mem_grant.ready := !w_grantlast && s_acquire
......@@ -632,6 +680,12 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
io.forwardInfo.apply(req_valid, req.addr, refill_data_raw, w_grantfirst, w_grantlast)
// refill latency monitor
io.latency_monitor.load_miss_refilling := req_valid && req_primary_fire.isFromLoad && BoolStopWatch(io.mem_acquire.fire, io.mem_grant.fire && !refill_done, true)
io.latency_monitor.store_miss_refilling := req_valid && req_primary_fire.isFromStore && BoolStopWatch(io.mem_acquire.fire, io.mem_grant.fire && !refill_done, true)
io.latency_monitor.amo_miss_refilling := req_valid && req_primary_fire.isFromAMO && BoolStopWatch(io.mem_acquire.fire, io.mem_grant.fire && !refill_done, true)
io.latency_monitor.pf_miss_refilling := req_valid && req_primary_fire.isFromPrefetch && BoolStopWatch(io.mem_acquire.fire, io.mem_grant.fire && !refill_done, true)
XSPerfAccumulate("miss_req_primary", primary_fire)
XSPerfAccumulate("miss_req_merged", secondary_fire)
XSPerfAccumulate("load_miss_penalty_to_use",
......@@ -725,6 +779,7 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
assert(PopCount(Seq(req_pipeline_reg_handled, VecInit(req_mshr_handled_vec).asUInt.orR)) <= 1.U, "miss req will either go to mshr or pipeline reg")
assert(PopCount(req_mshr_handled_vec) <= 1.U, "Only one mshr can handle a req")
io.resp.id := Mux(!req_pipeline_reg_handled, OHToUInt(req_mshr_handled_vec), miss_req_pipe_reg.mshr_id)
io.resp.handled := Cat(req_mshr_handled_vec).orR || req_pipeline_reg_handled
io.resp.merged := merge
io.resp.repl_way_en := Mux(!req_pipeline_reg_handled, Mux1H(secondary_ready_vec, entries.map(_.io.repl_way_en)), miss_req_pipe_reg.req.way_en)
......@@ -856,8 +911,9 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
debug_miss_trace.source := io.req.bits.source
debug_miss_trace.pc := io.req.bits.pc
val isWriteL1MissQMissTable = WireInit(Constantin.createRecord("isWriteL1MissQMissTable" + p(XSCoreParamsKey).HartId.toString))
val table = ChiselDB.createTable("L1MissQMissTrace_hart"+ p(XSCoreParamsKey).HartId.toString, new L1MissTrace)
table.log(debug_miss_trace, io.req.valid && !io.req.bits.cancel && alloc, "MissQueue", clock, reset)
table.log(debug_miss_trace, isWriteL1MissQMissTable.orR && io.req.valid && !io.req.bits.cancel && alloc, "MissQueue", clock, reset)
// Difftest
if (env.EnableDifftest) {
......@@ -871,13 +927,16 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
}
// Perf count
XSPerfAccumulate("miss_req", io.req.fire())
XSPerfAccumulate("miss_req_allocate", io.req.fire() && alloc)
XSPerfAccumulate("miss_req_merge_load", io.req.fire() && merge && io.req.bits.isFromLoad)
XSPerfAccumulate("miss_req_reject_load", io.req.valid && reject && io.req.bits.isFromLoad)
XSPerfAccumulate("miss_req", io.req.fire() && !io.req.bits.cancel)
XSPerfAccumulate("miss_req_allocate", io.req.fire() && !io.req.bits.cancel && alloc)
XSPerfAccumulate("miss_req_load_allocate", io.req.fire() && !io.req.bits.cancel && alloc && io.req.bits.isFromLoad)
XSPerfAccumulate("miss_req_store_allocate", io.req.fire() && !io.req.bits.cancel && alloc && io.req.bits.isFromStore)
XSPerfAccumulate("miss_req_amo_allocate", io.req.fire() && !io.req.bits.cancel && alloc && io.req.bits.isFromAMO)
XSPerfAccumulate("miss_req_merge_load", io.req.fire() && !io.req.bits.cancel && merge && io.req.bits.isFromLoad)
XSPerfAccumulate("miss_req_reject_load", io.req.valid && !io.req.bits.cancel && reject && io.req.bits.isFromLoad)
XSPerfAccumulate("probe_blocked_by_miss", io.probe_block)
XSPerfAccumulate("prefetch_primary_fire", io.req.fire() && alloc && io.req.bits.isFromPrefetch)
XSPerfAccumulate("prefetch_secondary_fire", io.req.fire() && merge && io.req.bits.isFromPrefetch)
XSPerfAccumulate("prefetch_primary_fire", io.req.fire() && !io.req.bits.cancel && alloc && io.req.bits.isFromPrefetch)
XSPerfAccumulate("prefetch_secondary_fire", io.req.fire() && !io.req.bits.cancel && merge && io.req.bits.isFromPrefetch)
val max_inflight = RegInit(0.U((log2Up(cfg.nMissEntries) + 1).W))
val num_valids = PopCount(~Cat(primary_ready_vec).asUInt)
when (num_valids > max_inflight) {
......@@ -889,6 +948,32 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
io.full := num_valids === cfg.nMissEntries.U
XSPerfHistogram("num_valids", num_valids, true.B, 0, cfg.nMissEntries, 1)
XSPerfHistogram("L1DMLP_CPUData", PopCount(VecInit(entries.map(_.io.perf_pending_normal)).asUInt), true.B, 0, cfg.nMissEntries, 1)
XSPerfHistogram("L1DMLP_Prefetch", PopCount(VecInit(entries.map(_.io.perf_pending_prefetch)).asUInt), true.B, 0, cfg.nMissEntries, 1)
XSPerfHistogram("L1DMLP_Total", num_valids, true.B, 0, cfg.nMissEntries, 1)
XSPerfAccumulate("miss_load_refill_latency", PopCount(entries.map(_.io.latency_monitor.load_miss_refilling)))
XSPerfAccumulate("miss_store_refill_latency", PopCount(entries.map(_.io.latency_monitor.store_miss_refilling)))
XSPerfAccumulate("miss_amo_refill_latency", PopCount(entries.map(_.io.latency_monitor.amo_miss_refilling)))
XSPerfAccumulate("miss_pf_refill_latency", PopCount(entries.map(_.io.latency_monitor.pf_miss_refilling)))
val rob_head_miss_in_dcache = VecInit(entries.map(_.io.rob_head_query.resp)).asUInt.orR
val sourceVaddr = WireInit(0.U.asTypeOf(new Valid(UInt(VAddrBits.W))))
val lq_doing_other_replay = WireInit(false.B)
ExcitingUtils.addSink(sourceVaddr, s"rob_head_vaddr_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(lq_doing_other_replay, s"rob_head_other_replay_${coreParams.HartId}", ExcitingUtils.Perf)
entries.foreach {
case e => {
e.io.rob_head_query.query_valid := sourceVaddr.valid
e.io.rob_head_query.vaddr := sourceVaddr.bits
}
}
// ExcitingUtils.addSource(!rob_head_miss_in_dcache && !lq_doing_other_replay, s"load_l1_cache_stall_without_bank_conflict_${coreParams.HartId}", ExcitingUtils.Perf, true)
ExcitingUtils.addSource(rob_head_miss_in_dcache, s"load_l1_miss_${coreParams.HartId}", ExcitingUtils.Perf, true)
val perfValidCount = RegNext(PopCount(entries.map(entry => (!entry.io.primary_ready))))
val perfEvents = Seq(
("dcache_missq_req ", io.req.fire()),
......
......@@ -230,6 +230,7 @@ class ProbeQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule w
when (io.lrsc_locked_block.valid) {
XSDebug("lrsc_locked_block: %x\n", io.lrsc_locked_block.bits)
}
XSPerfAccumulate("ProbeL1DCache", io.mem_probe.fire)
val perfValidCount = RegNext(PopCount(entries.map(e => e.io.block_addr.valid)))
val perfEvents = Seq(
......
......@@ -81,9 +81,9 @@ class RefillPipe(implicit p: Parameters) extends DCacheModule {
})
// Assume that write in refill pipe is always ready
assert(RegNext(io.data_write.ready))
assert(RegNext(io.meta_write.ready))
assert(RegNext(io.tag_write.ready))
// assert(RegNext(io.data_write.ready))
// assert(RegNext(io.meta_write.ready))
// assert(RegNext(io.tag_write.ready))
val refill_w_valid = io.req.valid
val refill_w_req = io.req.bits
......
......@@ -21,7 +21,7 @@ import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink.TLPermissions._
import freechips.rocketchip.tilelink.{TLArbiter, TLBundleC, TLBundleD, TLEdgeOut}
import huancun.DirtyKey
import coupledL2.DirtyKey
import utils.{HasPerfEvents, HasTLDump, XSDebug, XSPerfAccumulate}
class WritebackReqCtrl(implicit p: Parameters) extends DCacheBundle {
......@@ -210,7 +210,8 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
// --------------------------------------------------------------------------------
// s_invalid: receive requests
// new req entering
when (io.req.valid && io.primary_valid && io.primary_ready) {
val alloc = io.req.valid && io.primary_valid && io.primary_ready
when (alloc) {
assert (remain === 0.U)
req := io.req.bits
s_data_override := false.B
......@@ -313,7 +314,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
data = beat_data(beat)
)._2
voluntaryReleaseData.echo.lift(DirtyKey).foreach(_ := req.dirty)
// voluntaryReleaseData.echo.lift(DirtyKey).foreach(_ := req.dirty)
when(busy) {
assert(!req.dirty || req.hasData)
}
......@@ -517,7 +518,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
data := mergeData(data, io.release_update.bits.data_delayed, io.release_update.bits.mask_delayed)
}
when (!s_data_override && req.hasData) {
when (!s_data_override && (req.hasData || RegNext(alloc))) {
data := io.req_data.data
}
......
......@@ -148,46 +148,38 @@ class DuplicatedTagArray(readPorts: Int)(implicit p: Parameters) extends DCacheM
val rdataEcc_dup_vec = Seq(3, 4, 5)
val wdata_dup_vec = Seq(6, 7, 8)
val wdataEcc_dup_vec = Seq(9, 10, 11)
for(dupIdx <- rdata_dup_vec) {
for(idx <- 0 until readPorts){
when(io.cacheOp_req_dup(dupIdx).valid && isReadTag(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.read.valid := true.B
array(idx).io.read.bits.idx := io.cacheOp.req.bits.index
array(idx).io.read.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
cacheOpShouldResp := true.B
}
rdata_dup_vec.zipWithIndex.map{ case(dupIdx, idx) =>
when(io.cacheOp_req_dup(dupIdx).valid && isReadTag(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.read.valid := true.B
array(idx).io.read.bits.idx := io.cacheOp.req.bits.index
array(idx).io.read.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
cacheOpShouldResp := true.B
}
}
for (dupIdx <- rdataEcc_dup_vec) {
for (idx <- 0 until readPorts) {
when(io.cacheOp_req_dup(dupIdx).valid && isReadTagECC(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.ecc_read.valid := true.B
array(idx).io.ecc_read.bits.idx := io.cacheOp.req.bits.index
array(idx).io.ecc_read.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
cacheOpShouldResp := true.B
}
rdataEcc_dup_vec.zipWithIndex.map{ case(dupIdx, idx) =>
when(io.cacheOp_req_dup(dupIdx).valid && isReadTagECC(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.ecc_read.valid := true.B
array(idx).io.ecc_read.bits.idx := io.cacheOp.req.bits.index
array(idx).io.ecc_read.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
cacheOpShouldResp := true.B
}
}
for (dupIdx <- wdata_dup_vec) {
for (idx <- 0 until readPorts) {
when(io.cacheOp_req_dup(dupIdx).valid && isWriteTag(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.write.valid := true.B
array(idx).io.write.bits.idx := io.cacheOp.req.bits.index
array(idx).io.write.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
array(idx).io.write.bits.tag := io.cacheOp.req.bits.write_tag_low
cacheOpShouldResp := true.B
}
wdata_dup_vec.zipWithIndex.map{ case(dupIdx, idx) =>
when(io.cacheOp_req_dup(dupIdx).valid && isWriteTag(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.write.valid := true.B
array(idx).io.write.bits.idx := io.cacheOp.req.bits.index
array(idx).io.write.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
array(idx).io.write.bits.tag := io.cacheOp.req.bits.write_tag_low
cacheOpShouldResp := true.B
}
}
for (dupIdx <- wdataEcc_dup_vec) {
for (idx <- 0 until readPorts) {
when(io.cacheOp_req_dup(dupIdx).valid && isWriteTagECC(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.ecc_write.valid := true.B
array(idx).io.ecc_write.bits.idx := io.cacheOp.req.bits.index
array(idx).io.ecc_write.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
array(idx).io.ecc_write.bits.ecc := io.cacheOp.req.bits.write_tag_ecc
cacheOpShouldResp := true.B
}
wdataEcc_dup_vec.zipWithIndex.map{ case(dupIdx, idx) =>
when(io.cacheOp_req_dup(dupIdx).valid && isWriteTagECC(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.ecc_write.valid := true.B
array(idx).io.ecc_write.bits.idx := io.cacheOp.req.bits.index
array(idx).io.ecc_write.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
array(idx).io.ecc_write.bits.ecc := io.cacheOp.req.bits.write_tag_ecc
cacheOpShouldResp := true.B
}
}
......
......@@ -29,7 +29,6 @@ import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp}
import freechips.rocketchip.tilelink._
import xiangshan.backend.fu.{PMP, PMPChecker, PMPReqBundle, PMPRespBundle}
import xiangshan.backend.fu.util.HasCSRConst
import utility.ChiselDB
import difftest._
class L2TLB()(implicit p: Parameters) extends LazyModule with HasPtwConst {
......@@ -38,7 +37,8 @@ class L2TLB()(implicit p: Parameters) extends LazyModule with HasPtwConst {
clients = Seq(TLMasterParameters.v1(
"ptw",
sourceId = IdRange(0, MemReqWidth)
))
)),
requestFields = Seq(ReqSourceField())
)))
lazy val module = new L2TLBImp(this)
......@@ -136,10 +136,11 @@ class L2TLBImp(outer: L2TLB)(implicit p: Parameters) extends PtwModule(outer) wi
prefetch.io.csr := csr_dup(0)
arb2.io.in(InArbPrefetchPort) <> prefetch.io.out
val isWriteL2TlbPrefetchTable = WireInit(Constantin.createRecord("isWriteL2TlbPrefetchTable" + p(XSCoreParamsKey).HartId.toString))
val L2TlbPrefetchTable = ChiselDB.createTable("L2TlbPrefetch_hart" + p(XSCoreParamsKey).HartId.toString, new L2TlbPrefetchDB)
val L2TlbPrefetchDB = Wire(new L2TlbPrefetchDB)
L2TlbPrefetchDB.vpn := prefetch.io.out.bits.vpn
L2TlbPrefetchTable.log(L2TlbPrefetchDB, prefetch.io.out.fire, "L2TlbPrefetch", clock, reset)
L2TlbPrefetchTable.log(L2TlbPrefetchDB, isWriteL2TlbPrefetchTable.orR && prefetch.io.out.fire, "L2TlbPrefetch", clock, reset)
}
arb2.io.out.ready := cache.io.req.ready
......@@ -252,6 +253,7 @@ class L2TLBImp(outer: L2TLB)(implicit p: Parameters) extends PtwModule(outer) wi
)._2
mem.a.bits := memRead
mem.a.valid := mem_arb.io.out.valid && !flush
mem.a.bits.user.lift(ReqSourceKey).foreach(_ := MemReqSource.PTW.id.U)
mem.d.ready := true.B
// mem -> data buffer
val refill_data = Reg(Vec(blockBits / l1BusDataWidth, UInt(l1BusDataWidth.W)))
......@@ -443,6 +445,7 @@ class L2TLBImp(outer: L2TLB)(implicit p: Parameters) extends PtwModule(outer) wi
ptw_sector_resp.af := pte.entry(OHToUInt(pte.pteidx)).af
ptw_sector_resp.pf := pte.entry(OHToUInt(pte.pteidx)).pf
ptw_sector_resp.addr_low := OHToUInt(pte.pteidx)
ptw_sector_resp.pteidx := pte.pteidx
for (i <- 0 until tlbcontiguous) {
val ppn_equal = pte.entry(i).ppn === pte.entry(OHToUInt(pte.pteidx)).ppn
val perm_equal = pte.entry(i).perm.getOrElse(0.U.asTypeOf(new PtePermBundle)).asUInt === pte.entry(OHToUInt(pte.pteidx)).perm.getOrElse(0.U.asTypeOf(new PtePermBundle)).asUInt
......@@ -496,17 +499,19 @@ class L2TLBImp(outer: L2TLB)(implicit p: Parameters) extends PtwModule(outer) wi
val perfEvents = Seq(llptw, cache, ptw).flatMap(_.getPerfEvents)
generatePerfEvent()
val isWriteL1TlbTable = WireInit(Constantin.createRecord("isWriteL1TlbTable" + p(XSCoreParamsKey).HartId.toString))
val L1TlbTable = ChiselDB.createTable("L1Tlb_hart" + p(XSCoreParamsKey).HartId.toString, new L1TlbDB)
val ITlbReqDB, DTlbReqDB, ITlbRespDB, DTlbRespDB = Wire(new L1TlbDB)
ITlbReqDB.vpn := io.tlb(0).req(0).bits.vpn
DTlbReqDB.vpn := io.tlb(1).req(0).bits.vpn
ITlbRespDB.vpn := io.tlb(0).resp.bits.entry.tag
DTlbRespDB.vpn := io.tlb(1).resp.bits.entry.tag
L1TlbTable.log(ITlbReqDB, io.tlb(0).req(0).fire, "ITlbReq", clock, reset)
L1TlbTable.log(DTlbReqDB, io.tlb(1).req(0).fire, "DTlbReq", clock, reset)
L1TlbTable.log(ITlbRespDB, io.tlb(0).resp.fire, "ITlbResp", clock, reset)
L1TlbTable.log(DTlbRespDB, io.tlb(1).resp.fire, "DTlbResp", clock, reset)
L1TlbTable.log(ITlbReqDB, isWriteL1TlbTable.orR && io.tlb(0).req(0).fire, "ITlbReq", clock, reset)
L1TlbTable.log(DTlbReqDB, isWriteL1TlbTable.orR && io.tlb(1).req(0).fire, "DTlbReq", clock, reset)
L1TlbTable.log(ITlbRespDB, isWriteL1TlbTable.orR && io.tlb(0).resp.fire, "ITlbResp", clock, reset)
L1TlbTable.log(DTlbRespDB, isWriteL1TlbTable.orR && io.tlb(1).resp.fire, "DTlbResp", clock, reset)
val isWritePageCacheTable = WireInit(Constantin.createRecord("isWritePageCacheTable" + p(XSCoreParamsKey).HartId.toString))
val PageCacheTable = ChiselDB.createTable("PageCache_hart" + p(XSCoreParamsKey).HartId.toString, new PageCacheDB)
val PageCacheDB = Wire(new PageCacheDB)
PageCacheDB.vpn := Cat(cache.io.resp.bits.toTlb.entry(0).tag, OHToUInt(cache.io.resp.bits.toTlb.pteidx))
......@@ -518,8 +523,9 @@ class L2TLBImp(outer: L2TLB)(implicit p: Parameters) extends PtwModule(outer) wi
PageCacheDB.l2Hit := cache.io.resp.bits.toFsm.l2Hit
PageCacheDB.l1Hit := cache.io.resp.bits.toFsm.l1Hit
PageCacheDB.hit := cache.io.resp.bits.hit
PageCacheTable.log(PageCacheDB, cache.io.resp.fire, "PageCache", clock, reset)
PageCacheTable.log(PageCacheDB, isWritePageCacheTable.orR && cache.io.resp.fire, "PageCache", clock, reset)
val isWritePTWTable = WireInit(Constantin.createRecord("isWritePTWTable" + p(XSCoreParamsKey).HartId.toString))
val PTWTable = ChiselDB.createTable("PTW_hart" + p(XSCoreParamsKey).HartId.toString, new PTWDB)
val PTWReqDB, PTWRespDB, LLPTWReqDB, LLPTWRespDB = Wire(new PTWDB)
PTWReqDB.vpn := ptw.io.req.bits.req_info.vpn
......@@ -530,17 +536,18 @@ class L2TLBImp(outer: L2TLB)(implicit p: Parameters) extends PtwModule(outer) wi
LLPTWReqDB.source := llptw.io.in.bits.req_info.source
LLPTWRespDB.vpn := llptw.io.mem.refill.vpn
LLPTWRespDB.source := llptw.io.mem.refill.source
PTWTable.log(PTWReqDB, ptw.io.req.fire, "PTWReq", clock, reset)
PTWTable.log(PTWRespDB, ptw.io.mem.resp.fire, "PTWResp", clock, reset)
PTWTable.log(LLPTWReqDB, llptw.io.in.fire, "LLPTWReq", clock, reset)
PTWTable.log(LLPTWRespDB, llptw.io.mem.resp.fire, "LLPTWResp", clock, reset)
PTWTable.log(PTWReqDB, isWritePTWTable.orR && ptw.io.req.fire, "PTWReq", clock, reset)
PTWTable.log(PTWRespDB, isWritePTWTable.orR && ptw.io.mem.resp.fire, "PTWResp", clock, reset)
PTWTable.log(LLPTWReqDB, isWritePTWTable.orR && llptw.io.in.fire, "LLPTWReq", clock, reset)
PTWTable.log(LLPTWRespDB, isWritePTWTable.orR && llptw.io.mem.resp.fire, "LLPTWResp", clock, reset)
val isWriteL2TlbMissQueueTable = WireInit(Constantin.createRecord("isWriteL2TlbMissQueueTable" + p(XSCoreParamsKey).HartId.toString))
val L2TlbMissQueueTable = ChiselDB.createTable("L2TlbMissQueue_hart" + p(XSCoreParamsKey).HartId.toString, new L2TlbMissQueueDB)
val L2TlbMissQueueInDB, L2TlbMissQueueOutDB = Wire(new L2TlbMissQueueDB)
L2TlbMissQueueInDB.vpn := missQueue.io.in.bits.vpn
L2TlbMissQueueOutDB.vpn := missQueue.io.out.bits.vpn
L2TlbMissQueueTable.log(L2TlbMissQueueInDB, missQueue.io.in.fire, "L2TlbMissQueueIn", clock, reset)
L2TlbMissQueueTable.log(L2TlbMissQueueOutDB, missQueue.io.out.fire, "L2TlbMissQueueOut", clock, reset)
L2TlbMissQueueTable.log(L2TlbMissQueueInDB, isWriteL2TlbMissQueueTable.orR && missQueue.io.in.fire, "L2TlbMissQueueIn", clock, reset)
L2TlbMissQueueTable.log(L2TlbMissQueueOutDB, isWriteL2TlbMissQueueTable.orR && missQueue.io.out.fire, "L2TlbMissQueueOut", clock, reset)
}
/** BlockHelper, block missqueue, not to send too many req to cache
......
......@@ -81,6 +81,41 @@ class TlbPermBundle(implicit p: Parameters) extends TlbBundle {
val w = Bool()
val r = Bool()
val pm = new TlbPMBundle
/** Fill this permission bundle from a PTW sector response plus the matching
 *  static PMP configuration. Returns `this` for use in connection expressions.
 */
def apply(item: PtwSectorResp, pm: PMPConfig) = {
  // Software-defined permission bits come straight from the PTE in the response.
  val swPerm = item.entry.perm.get.asTypeOf(new PtePermBundle().cloneType)
  this.d := swPerm.d
  this.a := swPerm.a
  this.g := swPerm.g
  this.u := swPerm.u
  this.x := swPerm.x
  this.w := swPerm.w
  this.r := swPerm.r
  // Fault bits are decided by the PTW, not the PTE permissions.
  this.pf := item.pf
  this.af := item.af
  // Fold in the static PMP/PMA attributes.
  this.pm.assign_ap(pm)
  this
}
// Debug formatting: dump every permission field on one line.
override def toPrintable: Printable = {
  p"pf:${pf} af:${af} d:${d} a:${a} g:${g} u:${u} x:${x} w:${w} r:${r} pm:${pm}"
}
}
class TlbSectorPermBundle(implicit p: Parameters) extends TlbBundle {
val pf = Bool() // NOTE: if this is true, just raise pf
val af = Bool() // NOTE: if this is true, just raise af
// pagetable perm (software defined)
val d = Bool()
val a = Bool()
val g = Bool()
val u = Bool()
val x = Bool()
val w = Bool()
val r = Bool()
// static pmp & pma check has a minimum grain size of 4K
// So sector tlb will use eight static pm entries
val pm = Vec(tlbcontiguous, new TlbPMBundle)
......@@ -140,6 +175,95 @@ class CAMTemplate[T <: Data](val gen: T, val set: Int, val readWidth: Int)(impli
/** A single (non-sector) TLB entry.
 *
 *  Storage layout is parameterized by which page sizes the owning array holds
 *  (at least one must be enabled):
 *   - pageNormal only: 4KB pages; full-width tag/ppn, no level field
 *   - pageSuper only:  super pages; tag/ppn shortened by one vpn segment,
 *                      1-bit level
 *   - both:            2-bit level encodes which vpn segments are significant
 *  See the "level usage" note below for the level encoding.
 */
class TlbEntry(pageNormal: Boolean, pageSuper: Boolean)(implicit p: Parameters) extends TlbBundle {
  require(pageNormal || pageSuper)

  // Super-only arrays drop the lowest vpn segment from the stored tag.
  val tag = if (!pageNormal) UInt((vpnLen - vpnnLen).W)
            else UInt(vpnLen.W)
  val asid = UInt(asidLen.W)
  // Page-size discriminator; width depends on which page sizes coexist.
  val level = if (!pageNormal) Some(UInt(1.W))
              else if (!pageSuper) None
              else Some(UInt(2.W))
  val ppn = if (!pageNormal) UInt((ppnLen - vpnnLen).W)
            else UInt(ppnLen.W)
  val perm = new TlbPermBundle

  /** level usage:
   *  !PageSuper: page is only normal, level is None, match all the tag
   *  !PageNormal: page is only super, level is a Bool(), match high 9*2 parts
   *  bits0  0: need mid 9bits
   *         1: no need mid 9bits
   *  PageSuper && PageNormal: page hold all the three type,
   *  bits0  0: need low 9bits
   *  bits1  0: need mid 9bits
   */

  /** Tag/ASID compare for a lookup.
   *
   *  @param vpn        virtual page number being looked up
   *  @param asid       current address-space id
   *  @param nSets      set count of the owning array; the low set-index bits
   *                    of the tag are skipped in the compare (see NOTE below)
   *  @param ignoreAsid when true, hit regardless of ASID
   */
  def hit(vpn: UInt, asid: UInt, nSets: Int = 1, ignoreAsid: Boolean = false): Bool = {
    val asid_hit = if (ignoreAsid) true.B else (this.asid === asid)

    // NOTE: for timing, don't care low set index bits at hit check
    //       (they do not need to be stored at all)
    if (!pageSuper) asid_hit && drop_set_equal(vpn, tag, nSets)
    else if (!pageNormal) {
      // Super-only entry: compare the high segment always, and the mid
      // segment unless level masks it off.
      val tag_match_hi = tag(vpnnLen*2-1, vpnnLen) === vpn(vpnnLen*3-1, vpnnLen*2)
      val tag_match_mi = tag(vpnnLen-1, 0) === vpn(vpnnLen*2-1, vpnnLen)
      val tag_match = tag_match_hi && (level.get.asBool() || tag_match_mi)
      asid_hit && tag_match
    }
    else {
      // Mixed array: each level bit disables one segment compare.
      val tmp_level = level.get
      val tag_match_hi = tag(vpnnLen*3-1, vpnnLen*2) === vpn(vpnnLen*3-1, vpnnLen*2)
      val tag_match_mi = tag(vpnnLen*2-1, vpnnLen) === vpn(vpnnLen*2-1, vpnnLen)
      val tag_match_lo = tag(vpnnLen-1, 0) === vpn(vpnnLen-1, 0) // if pageNormal is false, this will always be false
      val tag_match = tag_match_hi && (tmp_level(1) || tag_match_mi) && (tmp_level(0) || tag_match_lo)
      asid_hit && tag_match
    }
  }

  /** Refill this entry from a PTW sector response.
   *  Returns `this` so the connections can be used as an expression.
   */
  def apply(item: PtwSectorResp, asid: UInt, pm: PMPConfig): TlbEntry = {
    // Sector responses carry a shortened tag; for normal pages re-append the
    // in-sector index (pteidx) to recover the full vpn-width tag.
    this.tag := {if (pageNormal) Cat(item.entry.tag, OHToUInt(item.pteidx)) else item.entry.tag(sectorvpnLen - 1, vpnnLen - sectortlbwidth)}
    this.asid := asid
    val inner_level = item.entry.level.getOrElse(0.U)
    // Re-encode the PTW level field into this entry's level format described
    // in the "level usage" note (assumes PTW level 2 = 4KB leaf — TODO confirm
    // against the PTW encoding).
    this.level.map(_ := { if (pageNormal && pageSuper) MuxLookup(inner_level, 0.U, Seq(
                                                        0.U -> 3.U,
                                                        1.U -> 1.U,
                                                        2.U -> 0.U ))
                          else if (pageSuper) ~inner_level(0)
                          else 0.U })
    // Normal pages concatenate the sector ppn with the per-PTE low bits.
    this.ppn := { if (!pageNormal) item.entry.ppn(sectorppnLen - 1, vpnnLen - sectortlbwidth)
                  else Cat(item.entry.ppn, item.ppn_low(OHToUInt(item.pteidx))) }
    this.perm.apply(item, pm)
    this
  }

  // 4KB is a normal entry; 2MB/1GB are considered super entries.
  def is_normalentry(): Bool = {
    if (!pageSuper) { true.B }
    else if (!pageNormal) { false.B }
    else { level.get === 0.U }
  }

  /** Compose the physical page number for `vpn`: super pages take their low
   *  ppn segments from the request vpn, selected by the level bits.
   *
   *  @param saveLevel when true, register the low 2*vpnnLen result bits
   *                   (enable = `valid`) to shorten the combinational path;
   *                   the high ppn bits bypass the register
   */
  def genPPN(saveLevel: Boolean = false, valid: Bool = false.B)(vpn: UInt) : UInt = {
    val inner_level = level.getOrElse(0.U)
    val ppn_res = if (!pageSuper) ppn
      else if (!pageNormal) Cat(ppn(ppnLen-vpnnLen-1, vpnnLen),
        Mux(inner_level(0), vpn(vpnnLen*2-1, vpnnLen), ppn(vpnnLen-1,0)),
        vpn(vpnnLen-1, 0))
      else Cat(ppn(ppnLen-1, vpnnLen*2),
        Mux(inner_level(1), vpn(vpnnLen*2-1, vpnnLen), ppn(vpnnLen*2-1, vpnnLen)),
        Mux(inner_level(0), vpn(vpnnLen-1, 0), ppn(vpnnLen-1, 0)))

    if (saveLevel) Cat(ppn(ppn.getWidth-1, vpnnLen*2), RegEnable(ppn_res(vpnnLen*2-1, 0), valid))
    else ppn_res
  }

  override def toPrintable: Printable = {
    val inner_level = level.getOrElse(2.U)
    p"asid: ${asid} level:${inner_level} vpn:${Hexadecimal(tag)} ppn:${Hexadecimal(ppn)} perm:${perm}"
  }
}
class TlbSectorEntry(pageNormal: Boolean, pageSuper: Boolean)(implicit p: Parameters) extends TlbBundle {
require(pageNormal || pageSuper)
val tag = if (!pageNormal) UInt((vpnLen - vpnnLen).W)
else UInt(sectorvpnLen.W)
val asid = UInt(asidLen.W)
......@@ -148,8 +272,9 @@ class TlbEntry(pageNormal: Boolean, pageSuper: Boolean)(implicit p: Parameters)
else Some(UInt(2.W))
val ppn = if (!pageNormal) UInt((ppnLen - vpnnLen).W)
else UInt(sectorppnLen.W)
val perm = new TlbPermBundle
val perm = new TlbSectorPermBundle
val valididx = Vec(tlbcontiguous, Bool())
val pteidx = Vec(tlbcontiguous, Bool())
val ppn_low = Vec(tlbcontiguous, UInt(sectortlbwidth.W))
/** level usage:
......@@ -224,7 +349,7 @@ class TlbEntry(pageNormal: Boolean, pageSuper: Boolean)(implicit p: Parameters)
vpn_hit && index_hit.reduce(_ || _) && PopCount(data.valididx) === 1.U
}
def apply(item: PtwSectorResp, asid: UInt, pm: Seq[PMPConfig]): TlbEntry = {
def apply(item: PtwSectorResp, asid: UInt, pm: Seq[PMPConfig]): TlbSectorEntry = {
this.tag := {if (pageNormal) item.entry.tag else item.entry.tag(sectorvpnLen - 1, vpnnLen - sectortlbwidth)}
this.asid := asid
val inner_level = item.entry.level.getOrElse(0.U)
......@@ -239,6 +364,7 @@ class TlbEntry(pageNormal: Boolean, pageSuper: Boolean)(implicit p: Parameters)
this.perm.apply(item, pm)
this.ppn_low := item.ppn_low
this.valididx := item.valididx
this.pteidx := item.pteidx
this
}
......@@ -302,7 +428,7 @@ class TlbStorageIO(nSets: Int, nWays: Int, ports: Int, nDups: Int = 1)(implicit
val resp = Vec(ports, ValidIO(new Bundle{
val hit = Output(Bool())
val ppn = Vec(nDups, Output(UInt(ppnLen.W)))
val perm = Vec(nDups, Output(new TlbPermBundle()))
val perm = Vec(nDups, Output(new TlbSectorPermBundle()))
}))
}
val w = Flipped(ValidIO(new Bundle {
......@@ -412,8 +538,8 @@ class MemBlockidxBundle(implicit p: Parameters) extends TlbBundle {
val is_ld = Bool()
val is_st = Bool()
val idx =
if (LoadQueueSize >= StoreQueueSize) {
val idx = UInt(log2Ceil(LoadQueueSize).W)
if (VirtualLoadQueueSize >= StoreQueueSize) {
val idx = UInt(log2Ceil(VirtualLoadQueueSize).W)
idx
} else {
val idx = UInt(log2Ceil(StoreQueueSize).W)
......@@ -866,6 +992,7 @@ class PtwSectorResp(implicit p: Parameters) extends PtwBundle {
val addr_low = UInt(sectortlbwidth.W)
val ppn_low = Vec(tlbcontiguous, UInt(sectortlbwidth.W))
val valididx = Vec(tlbcontiguous, Bool())
val pteidx = Vec(tlbcontiguous, Bool())
val pf = Bool()
val af = Bool()
......
......@@ -67,7 +67,7 @@ case class L2TLBParameters
spSize: Int = 16,
spReplacer: Option[String] = Some("plru"),
// filter
ifilterSize: Int = 4,
ifilterSize: Int = 8,
dfilterSize: Int = 8,
// miss queue, add more entries than 'must require'
// 0 for easier bug trigger, please set as big as u can, 8 maybe
......
......@@ -133,6 +133,7 @@ class PTWRepeaterNB(Width: Int = 1, passReady: Boolean = false, FenceDelay: Int)
class PTWFilterIO(Width: Int)(implicit p: Parameters) extends MMUIOBaseBundle {
val tlb = Flipped(new VectorTlbPtwIO(Width))
val ptw = new TlbPtwIO()
val rob_head_miss_in_tlb = Output(Bool())
def apply(tlb: VectorTlbPtwIO, ptw: TlbPtwIO, sfence: SfenceBundle, csr: TlbCsrBundle): Unit = {
this.tlb <> tlb
......@@ -247,6 +248,7 @@ class PTWFilter(Width: Int, Size: Int, FenceDelay: Int)(implicit p: Parameters)
io.tlb.resp.bits.data.addr_low := ptwResp.addr_low
io.tlb.resp.bits.data.ppn_low := ptwResp.ppn_low
io.tlb.resp.bits.data.valididx := ptwResp.valididx
io.tlb.resp.bits.data.pteidx := ptwResp.pteidx
io.tlb.resp.bits.data.pf := ptwResp.pf
io.tlb.resp.bits.data.af := ptwResp.af
io.tlb.resp.bits.data.memidx := memidx(OHToUInt(ptwResp_OldMatchVec))
......@@ -323,6 +325,14 @@ class PTWFilter(Width: Int, Size: Int, FenceDelay: Int)(implicit p: Parameters)
inflight_counter := 0.U
}
val sourceVaddr = WireInit(0.U.asTypeOf(new Valid(UInt(VAddrBits.W))))
ExcitingUtils.addSink(sourceVaddr, s"rob_head_vaddr_${coreParams.HartId}", ExcitingUtils.Perf)
io.rob_head_miss_in_tlb := VecInit(v.zip(vpn).map{case (vi, vpni) => {
vi && sourceVaddr.valid && vpni === get_pn(sourceVaddr.bits)
}}).asUInt.orR
// perf
XSPerfAccumulate("tlb_req_count", PopCount(Cat(io.tlb.req.map(_.valid))))
XSPerfAccumulate("tlb_req_count_filtered", Mux(do_enq, accumEnqNum(Width - 1), 0.U))
......
......@@ -73,7 +73,7 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
// val vmEnable = satp.mode === 8.U // && (mode < ModeM) // FIXME: fix me when boot xv6/linux...
val vmEnable = if (EnbaleTlbDebug) (satp.mode === 8.U)
else (satp.mode === 8.U && (mode < ModeM))
val portTranslateEnable = (0 until Width).map(i => vmEnable && !req(i).bits.no_translate)
val portTranslateEnable = (0 until Width).map(i => vmEnable && RegNext(!req(i).bits.no_translate))
val req_in = req
val req_out = req.map(a => RegEnable(a.bits, a.fire()))
......
......@@ -98,7 +98,7 @@ class TLBFA(
io.r.req.map(_.ready := true.B)
val v = RegInit(VecInit(Seq.fill(nWays)(false.B)))
val entries = Reg(Vec(nWays, new TlbEntry(normalPage, superPage)))
val entries = Reg(Vec(nWays, new TlbSectorEntry(normalPage, superPage)))
val g = entries.map(_.perm.g)
for (i <- 0 until ports) {
......@@ -186,14 +186,21 @@ class TLBFA(
io.victim.out.valid := v(victim_idx) && io.w.valid && entries(victim_idx).is_normalentry()
io.victim.out.bits.entry := ns_to_n(entries(victim_idx))
def ns_to_n(ns: TlbEntry): TlbEntry = {
def ns_to_n(ns: TlbSectorEntry): TlbEntry = {
val n = Wire(new TlbEntry(pageNormal = true, pageSuper = false))
n.perm := ns.perm
n.ppn := ns.ppn
n.tag := ns.tag
n.perm.af := ns.perm.af
n.perm.pf := ns.perm.pf
n.perm.d := ns.perm.d
n.perm.a := ns.perm.a
n.perm.g := ns.perm.g
n.perm.u := ns.perm.u
n.perm.x := ns.perm.x
n.perm.w := ns.perm.w
n.perm.r := ns.perm.r
n.perm.pm := ns.perm.pm(OHToUInt(ns.pteidx))
n.ppn := Cat(ns.ppn, ns.ppn_low(OHToUInt(ns.pteidx)))
n.tag := Cat(ns.tag, OHToUInt(ns.pteidx))
n.asid := ns.asid
n.valididx := ns.valididx
n.ppn_low := ns.ppn_low
n
}
......@@ -249,10 +256,10 @@ class TLBSA(
val vpn = req.bits.vpn
val vpn_reg = RegEnable(vpn, req.fire())
val ridx = get_set_idx(vpn(vpn.getWidth - 1, sectortlbwidth), nSets)
val ridx = get_set_idx(vpn, nSets)
val v_resize = v.asTypeOf(Vec(VPRE_SELECT, Vec(VPOST_SELECT, UInt(nWays.W))))
val vidx_resize = RegNext(v_resize(get_set_idx(drop_set_idx(vpn(vpn.getWidth - 1, sectortlbwidth), VPOST_SELECT), VPRE_SELECT)))
val vidx = vidx_resize(get_set_idx(vpn_reg(vpn_reg.getWidth - 1, sectortlbwidth), VPOST_SELECT)).asBools.map(_ && RegNext(req.fire()))
val vidx_resize = RegNext(v_resize(get_set_idx(drop_set_idx(vpn, VPOST_SELECT), VPRE_SELECT)))
val vidx = vidx_resize(get_set_idx(vpn_reg, VPOST_SELECT)).asBools.map(_ && RegNext(req.fire()))
val vidx_bypass = RegNext((entries.io.waddr === ridx) && entries.io.wen)
entries.io.raddr(i) := ridx
......@@ -261,7 +268,18 @@ class TLBSA(
resp.bits.hit := hit
for (d <- 0 until nDups) {
resp.bits.ppn(d) := data(d).genPPN()(vpn_reg)
resp.bits.perm(d) := data(d).perm
resp.bits.perm(d).pf := data(d).perm.pf
resp.bits.perm(d).af := data(d).perm.af
resp.bits.perm(d).d := data(d).perm.d
resp.bits.perm(d).a := data(d).perm.a
resp.bits.perm(d).g := data(d).perm.g
resp.bits.perm(d).u := data(d).perm.u
resp.bits.perm(d).x := data(d).perm.x
resp.bits.perm(d).w := data(d).perm.w
resp.bits.perm(d).r := data(d).perm.r
for (i <- 0 until tlbcontiguous) {
resp.bits.perm(d).pm(i) := data(d).perm.pm
}
}
resp.valid := { RegNext(req.valid) }
......@@ -269,7 +287,7 @@ class TLBSA(
resp.bits.ppn.suggestName("ppn")
resp.bits.perm.suggestName("perm")
access.sets := get_set_idx(vpn_reg(vpn_reg.getWidth - 1, sectortlbwidth), nSets) // no use
access.sets := get_set_idx(vpn_reg, nSets) // no use
access.touch_ways.valid := resp.valid && hit
access.touch_ways.bits := 1.U // TODO: set-assoc need no replacer when nset is 1
}
......@@ -280,7 +298,7 @@ class TLBSA(
get_set_idx(io.w.bits.data.entry.tag, nSets),
get_set_idx(io.victim.in.bits.entry.tag, nSets))
entries.io.wdata := Mux(io.w.valid,
(Wire(new TlbEntry(normalPage, superPage)).apply(io.w.bits.data, io.csr.satp.asid, io.w.bits.data_replenish)),
(Wire(new TlbEntry(normalPage, superPage)).apply(io.w.bits.data, io.csr.satp.asid, io.w.bits.data_replenish(OHToUInt(io.w.bits.data.pteidx)))),
io.victim.in.bits.entry)
when (io.victim.in.valid) {
......@@ -303,13 +321,12 @@ class TLBSA(
val sfence = io.sfence
val sfence_vpn = sfence.bits.addr.asTypeOf(new VaBundle().cloneType).vpn
// Sfence will flush all sectors of an entry when hit
when (io.sfence.valid) {
when (sfence.bits.rs1) { // virtual address *.rs1 <- (rs1===0.U)
v.map(a => a.map(b => b := false.B))
}.otherwise {
// specific addr but all asid
v(get_set_idx(sfence_vpn(sfence_vpn.getWidth - 1, sectortlbwidth), nSets)).map(_ := false.B)
v(get_set_idx(sfence_vpn, nSets)).map(_ := false.B)
}
}
......@@ -327,7 +344,7 @@ class TLBSA(
for (i <- 0 until nSets) {
XSPerfAccumulate(s"hit${i}", io.r.resp.map(a => a.valid & a.bits.hit)
.zip(io.r.req.map(a => RegNext(get_set_idx(a.bits.vpn(a.bits.vpn.getWidth - 1, sectortlbwidth), nSets)) === i.U))
.zip(io.r.req.map(a => RegNext(get_set_idx(a.bits.vpn, nSets)) === i.U))
.map{a => (a._1 && a._2).asUInt()}
.fold(0.U)(_ + _)
)
......@@ -335,7 +352,7 @@ class TLBSA(
for (i <- 0 until nSets) {
XSPerfAccumulate(s"access${i}", io.r.resp.map(_.valid)
.zip(io.r.req.map(a => RegNext(get_set_idx(a.bits.vpn(a.bits.vpn.getWidth - 1, sectortlbwidth), nSets)) === i.U))
.zip(io.r.req.map(a => RegNext(get_set_idx(a.bits.vpn, nSets)) === i.U))
.map{a => (a._1 && a._2).asUInt()}
.fold(0.U)(_ + _)
)
......@@ -508,11 +525,20 @@ class TlbStorageWrapper(ports: Int, q: TLBParameters, nDups: Int = 1)(implicit p
rp.bits.hit := np.bits.hit || sp.bits.hit
for (d <- 0 until nDups) {
rp.bits.ppn(d) := Mux(sp.bits.hit, sp.bits.ppn(0), np.bits.ppn(d))
rp.bits.perm(d) := Mux(sp.bits.hit, sp.bits.perm(0), np.bits.perm(d))
rp.bits.perm(d).pf := Mux(sp.bits.hit, sp.bits.perm(0).pf, np.bits.perm(d).pf)
rp.bits.perm(d).af := Mux(sp.bits.hit, sp.bits.perm(0).af, np.bits.perm(d).af)
rp.bits.perm(d).d := Mux(sp.bits.hit, sp.bits.perm(0).d, np.bits.perm(d).d)
rp.bits.perm(d).a := Mux(sp.bits.hit, sp.bits.perm(0).a, np.bits.perm(d).a)
rp.bits.perm(d).g := Mux(sp.bits.hit, sp.bits.perm(0).g, np.bits.perm(d).g)
rp.bits.perm(d).u := Mux(sp.bits.hit, sp.bits.perm(0).u, np.bits.perm(d).u)
rp.bits.perm(d).x := Mux(sp.bits.hit, sp.bits.perm(0).x, np.bits.perm(d).x)
rp.bits.perm(d).w := Mux(sp.bits.hit, sp.bits.perm(0).w, np.bits.perm(d).w)
rp.bits.perm(d).r := Mux(sp.bits.hit, sp.bits.perm(0).r, np.bits.perm(d).r)
rp.bits.perm(d).pm := DontCare
}
rp.bits.super_hit := sp.bits.hit
rp.bits.super_ppn := sp.bits.ppn(0)
rp.bits.spm := np.bits.perm(0).pm(RegNext(io.r.req(i).bits.vpn(sectortlbwidth - 1, 0)))
rp.bits.spm := np.bits.perm(0).pm(0)
// Sector tlb may trigger multi-hit, see def "wbhit"
XSPerfAccumulate(s"port${i}_np_sp_multi_hit", !(!np.bits.hit || !sp.bits.hit || !rp.valid))
//assert(!np.bits.hit || !sp.bits.hit || !rp.valid, s"${q.name} storage ports${i} normal and super multi-hit")
......
......@@ -25,6 +25,7 @@ import utils._
import utility._
import scala.math.min
import xiangshan.backend.decode.ImmUnion
trait HasBPUConst extends HasXSParameter {
val MaxMetaLength = if (!env.FPGAPlatform) 512 else 256 // TODO: Reduce meta length
......@@ -244,6 +245,43 @@ class Predictor(implicit p: Parameters) extends XSModule with HasBPUConst with H
val ctrl = DelayN(io.ctrl, 1)
val predictors = Module(if (useBPD) new Composer else new FakePredictor)
def numOfStage = 3
require(numOfStage > 1, "BPU numOfStage must be greater than 1")
val topdown_stages = RegInit(VecInit(Seq.fill(numOfStage)(0.U.asTypeOf(new FrontendTopDownBundle))))
dontTouch(topdown_stages)
// following can only happen on s1
val controlRedirectBubble = Wire(Bool())
val ControlBTBMissBubble = Wire(Bool())
val TAGEMissBubble = Wire(Bool())
val SCMissBubble = Wire(Bool())
val ITTAGEMissBubble = Wire(Bool())
val RASMissBubble = Wire(Bool())
val memVioRedirectBubble = Wire(Bool())
val otherRedirectBubble = Wire(Bool())
val btbMissBubble = Wire(Bool())
otherRedirectBubble := false.B
memVioRedirectBubble := false.B
// override can happen between s1-s2 and s2-s3
val overrideBubble = Wire(Vec(numOfStage - 1, Bool()))
def overrideStage = 1
// ftq update block can happen on s1, s2 and s3
val ftqUpdateBubble = Wire(Vec(numOfStage, Bool()))
def ftqUpdateStage = 0
// ftq full stall only happens on s3 (last stage)
val ftqFullStall = Wire(Bool())
// by default, no bubble event
topdown_stages(0) := 0.U.asTypeOf(new FrontendTopDownBundle)
// event movement driven by clock only
for (i <- 0 until numOfStage - 1) {
topdown_stages(i + 1) := topdown_stages(i)
}
// ctrl signal
predictors.io.ctrl := ctrl
predictors.io.reset_vector := io.reset_vector
......@@ -644,6 +682,74 @@ class Predictor(implicit p: Parameters) extends XSModule with HasBPUConst with H
}
}
// Commit time history checker
if (EnableCommitGHistDiff) {
val commitGHist = RegInit(0.U.asTypeOf(Vec(HistoryLength, Bool())))
val commitGHistPtr = RegInit(0.U.asTypeOf(new CGHPtr))
def getCommitHist(ptr: CGHPtr): UInt =
(Cat(commitGHist.asUInt, commitGHist.asUInt) >> (ptr.value+1.U))(HistoryLength-1, 0)
val updateValid : Bool = io.ftq_to_bpu.update.valid
val branchValidMask : UInt = io.ftq_to_bpu.update.bits.ftb_entry.brValids.asUInt
val branchCommittedMask: Vec[Bool] = io.ftq_to_bpu.update.bits.br_committed
val misPredictMask : UInt = io.ftq_to_bpu.update.bits.mispred_mask.asUInt
val takenMask : UInt =
io.ftq_to_bpu.update.bits.br_taken_mask.asUInt |
io.ftq_to_bpu.update.bits.ftb_entry.always_taken.asUInt // Always taken branch is recorded in history
val takenIdx : UInt = (PriorityEncoder(takenMask) + 1.U((log2Ceil(numBr)+1).W)).asUInt
val misPredictIdx : UInt = (PriorityEncoder(misPredictMask) + 1.U((log2Ceil(numBr)+1).W)).asUInt
val shouldShiftMask: UInt = Mux(takenMask.orR,
LowerMask(takenIdx).asUInt,
((1 << numBr) - 1).asUInt) &
Mux(misPredictMask.orR,
LowerMask(misPredictIdx).asUInt,
((1 << numBr) - 1).asUInt) &
branchCommittedMask.asUInt
val updateShift : UInt =
Mux(updateValid && branchValidMask.orR, PopCount(branchValidMask & shouldShiftMask), 0.U)
dontTouch(updateShift)
dontTouch(commitGHist)
dontTouch(commitGHistPtr)
dontTouch(takenMask)
dontTouch(branchValidMask)
dontTouch(branchCommittedMask)
// Maintain the commitGHist
for (i <- 0 until numBr) {
when(updateShift >= (i + 1).U) {
val ptr: CGHPtr = commitGHistPtr - i.asUInt
commitGHist(ptr.value) := takenMask(i)
}
}
when(updateValid) {
commitGHistPtr := commitGHistPtr - updateShift
}
// Calculate true history using Parallel XOR
def computeFoldedHist(hist: UInt, compLen: Int)(histLen: Int): UInt = {
if (histLen > 0) {
val nChunks = (histLen + compLen - 1) / compLen
val hist_chunks = (0 until nChunks) map { i =>
hist(min((i + 1) * compLen, histLen) - 1, i * compLen)
}
ParallelXOR(hist_chunks)
}
else 0.U
}
// Do differential
val predictFHistAll: AllFoldedHistories = io.ftq_to_bpu.update.bits.spec_info.folded_hist
TageTableInfos.map {
case (nRows, histLen, _) => {
val nRowsPerBr = nRows / numBr
val commitTrueHist: UInt = computeFoldedHist(getCommitHist(commitGHistPtr), log2Ceil(nRowsPerBr))(histLen)
val predictFHist : UInt = predictFHistAll.
getHistWithInfo((histLen, min(histLen, log2Ceil(nRowsPerBr)))).folded_hist
XSWarn(updateValid && predictFHist =/= commitTrueHist,
p"predict time ghist: ${predictFHist} is different from commit time: ${commitTrueHist}\n")
}
}
}
// val updatedGh = oldGh.update(shift, taken && addIntoHist)
......@@ -677,6 +783,87 @@ class Predictor(implicit p: Parameters) extends XSModule with HasBPUConst with H
}
}
// TODO: signals for memVio and other Redirects
controlRedirectBubble := do_redirect.valid && do_redirect.bits.ControlRedirectBubble
ControlBTBMissBubble := do_redirect.bits.ControlBTBMissBubble
TAGEMissBubble := do_redirect.bits.TAGEMissBubble
SCMissBubble := do_redirect.bits.SCMissBubble
ITTAGEMissBubble := do_redirect.bits.ITTAGEMissBubble
RASMissBubble := do_redirect.bits.RASMissBubble
memVioRedirectBubble := do_redirect.valid && do_redirect.bits.MemVioRedirectBubble
otherRedirectBubble := do_redirect.valid && do_redirect.bits.OtherRedirectBubble
btbMissBubble := do_redirect.valid && do_redirect.bits.BTBMissBubble
overrideBubble(0) := s2_redirect
overrideBubble(1) := s3_redirect
ftqUpdateBubble(0) := !s1_components_ready
ftqUpdateBubble(1) := !s2_components_ready
ftqUpdateBubble(2) := !s3_components_ready
ftqFullStall := !io.bpu_to_ftq.resp.ready
io.bpu_to_ftq.resp.bits.topdown_info := topdown_stages(numOfStage - 1)
// topdown handling logic here
when (controlRedirectBubble) {
/*
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.ControlRedirectBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.ControlRedirectBubble.id) := true.B
*/
when (ControlBTBMissBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.BTBMissBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.BTBMissBubble.id) := true.B
} .elsewhen (TAGEMissBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.TAGEMissBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.TAGEMissBubble.id) := true.B
} .elsewhen (SCMissBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.SCMissBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.SCMissBubble.id) := true.B
} .elsewhen (ITTAGEMissBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.ITTAGEMissBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.ITTAGEMissBubble.id) := true.B
} .elsewhen (RASMissBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.RASMissBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.RASMissBubble.id) := true.B
}
}
when (memVioRedirectBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.MemVioRedirectBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.MemVioRedirectBubble.id) := true.B
}
when (otherRedirectBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.OtherRedirectBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.OtherRedirectBubble.id) := true.B
}
when (btbMissBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.BTBMissBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.BTBMissBubble.id) := true.B
}
for (i <- 0 until numOfStage) {
if (i < numOfStage - overrideStage) {
when (overrideBubble(i)) {
for (j <- 0 to i)
topdown_stages(j).reasons(TopDownCounters.OverrideBubble.id) := true.B
}
}
if (i < numOfStage - ftqUpdateStage) {
when (ftqUpdateBubble(i)) {
topdown_stages(i).reasons(TopDownCounters.FtqUpdateBubble.id) := true.B
}
}
}
when (ftqFullStall) {
topdown_stages(0).reasons(TopDownCounters.FtqFullStall.id) := true.B
}
XSError(isBefore(redirect.cfiUpdate.histPtr, s3_ghist_ptr) && do_redirect.valid, p"s3_ghist_ptr ${s3_ghist_ptr} exceeds redirect histPtr ${redirect.cfiUpdate.histPtr}\n")
XSError(isBefore(redirect.cfiUpdate.histPtr, s2_ghist_ptr) && do_redirect.valid, p"s2_ghist_ptr ${s2_ghist_ptr} exceeds redirect histPtr ${redirect.cfiUpdate.histPtr}\n")
XSError(isBefore(redirect.cfiUpdate.histPtr, s1_ghist_ptr) && do_redirect.valid, p"s1_ghist_ptr ${s1_ghist_ptr} exceeds redirect histPtr ${redirect.cfiUpdate.histPtr}\n")
......
......@@ -56,6 +56,8 @@ class FtbSlot(val offsetLen: Int, val subOffsetLen: Option[Int] = None)(implicit
val sharing = Bool()
val valid = Bool()
val sc = Bool() // set by sc in s3, perf use only
def setLowerStatByTarget(pc: UInt, target: UInt, isShare: Boolean) = {
def getTargetStatByHigher(pc_higher: UInt, target_higher: UInt) =
Mux(target_higher > pc_higher, TAR_OVF,
......
......@@ -44,7 +44,7 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
val hartId = Input(UInt(8.W))
val reset_vector = Input(UInt(PAddrBits.W))
val fencei = Input(Bool())
val ptw = new VectorTlbPtwIO(4)
val ptw = new VectorTlbPtwIO(coreParams.itlbPortNum)
val backend = new FrontendToCtrlIO
val sfence = Input(new SfenceBundle)
val tlbCsr = Input(new TlbCsrBundle)
......@@ -69,6 +69,13 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
val ftq = Module(new Ftq)
val needFlush = RegNext(io.backend.toFtq.redirect.valid)
val FlushControlRedirect = RegNext(io.backend.toFtq.redirect.bits.debugIsCtrl)
val FlushMemVioRedirect = RegNext(io.backend.toFtq.redirect.bits.debugIsMemVio)
val FlushControlBTBMiss = Wire(Bool())
val FlushTAGEMiss = Wire(Bool())
val FlushSCMiss = Wire(Bool())
val FlushITTAGEMiss = Wire(Bool())
val FlushRASMiss = Wire(Bool())
val tlbCsr = DelayN(io.tlbCsr, 2)
val csrCtrl = DelayN(io.csrCtrl, 2)
......@@ -84,26 +91,24 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
bpu.io.reset_vector := io.reset_vector
// pmp
val prefetchPipeNum = ICacheParameters().prefetchPipeNum
val pmp = Module(new PMP())
val pmp_check = VecInit(Seq.fill(4)(Module(new PMPChecker(3, sameCycle = true)).io))
val pmp_check = VecInit(Seq.fill(coreParams.ipmpPortNum)(Module(new PMPChecker(3, sameCycle = true)).io))
pmp.io.distribute_csr := csrCtrl.distribute_csr
val pmp_req_vec = Wire(Vec(4, Valid(new PMPReqBundle())))
pmp_req_vec(0) <> icache.io.pmp(0).req
pmp_req_vec(1) <> icache.io.pmp(1).req
pmp_req_vec(2) <> icache.io.pmp(2).req
pmp_req_vec(3) <> ifu.io.pmp.req
val pmp_req_vec = Wire(Vec(coreParams.ipmpPortNum, Valid(new PMPReqBundle())))
(0 until 2 + prefetchPipeNum).foreach(i => pmp_req_vec(i) <> icache.io.pmp(i).req)
pmp_req_vec.last <> ifu.io.pmp.req
for (i <- pmp_check.indices) {
pmp_check(i).apply(tlbCsr.priv.imode, pmp.io.pmp, pmp.io.pma, pmp_req_vec(i))
}
icache.io.pmp(0).resp <> pmp_check(0).resp
icache.io.pmp(1).resp <> pmp_check(1).resp
icache.io.pmp(2).resp <> pmp_check(2).resp
ifu.io.pmp.resp <> pmp_check(3).resp
val itlb = Module(new TLB(4, nRespDups = 1, Seq(true, true, false, true), itlbParams))
itlb.io.requestor.take(3) zip icache.io.itlb foreach {case (a,b) => a <> b}
itlb.io.requestor(3) <> ifu.io.iTLBInter // mmio may need re-tlb, blocked
(0 until 2 + prefetchPipeNum).foreach(i => icache.io.pmp(i).resp <> pmp_check(i).resp)
ifu.io.pmp.resp <> pmp_check.last.resp
val itlb = Module(new TLB(coreParams.itlbPortNum, nRespDups = 1,
Seq(true, true) ++ Seq.fill(prefetchPipeNum)(false) ++ Seq(true), itlbParams))
itlb.io.requestor.take(2 + prefetchPipeNum) zip icache.io.itlb foreach {case (a,b) => a <> b}
itlb.io.requestor.last <> ifu.io.iTLBInter // mmio may need re-tlb, blocked
itlb.io.base_connect(io.sfence, tlbCsr)
io.ptw.connect(itlb.io.ptw)
itlb.io.ptw_replenish <> DontCare
......@@ -128,6 +133,8 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
ifu.io.icacheInter.resp <> icache.io.fetch.resp
ifu.io.icacheInter.icacheReady := icache.io.toIFU
ifu.io.icacheInter.topdownIcacheMiss := icache.io.fetch.topdownIcacheMiss
ifu.io.icacheInter.topdownItlbMiss := icache.io.fetch.topdownItlbMiss
icache.io.stop := ifu.io.icacheStop
ifu.io.icachePerfInfo := icache.io.perfInfo
......@@ -138,6 +145,8 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
icache.io.csr_pf_enable := RegNext(csrCtrl.l1I_pf_enable)
icache.io.csr_parity_enable := RegNext(csrCtrl.icache_parity_enable)
icache.io.fencei := io.fencei
//IFU-Ibuffer
ifu.io.toIbuffer <> ibuffer.io.in
......@@ -148,7 +157,23 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
ifu.io.rob_commits <> io.backend.toFtq.rob_commits
ibuffer.io.flush := needFlush
ibuffer.io.ControlRedirect := FlushControlRedirect
ibuffer.io.MemVioRedirect := FlushMemVioRedirect
ibuffer.io.ControlBTBMissBubble := FlushControlBTBMiss
ibuffer.io.TAGEMissBubble := FlushTAGEMiss
ibuffer.io.SCMissBubble := FlushSCMiss
ibuffer.io.ITTAGEMissBubble := FlushITTAGEMiss
ibuffer.io.RASMissBubble := FlushRASMiss
FlushControlBTBMiss := ftq.io.ControlBTBMissBubble
FlushTAGEMiss := ftq.io.TAGEMissBubble
FlushSCMiss := ftq.io.SCMissBubble
FlushITTAGEMiss := ftq.io.ITTAGEMissBubble
FlushRASMiss := ftq.io.RASMissBubble
io.backend.cfVec <> ibuffer.io.out
io.backend.stallReason <> ibuffer.io.stallReason
dontTouch(io.backend.stallReason)
instrUncache.io.req <> ifu.io.uncacheInter.toUncache
ifu.io.uncacheInter.fromUncache <> instrUncache.io.resp
......
......@@ -24,6 +24,12 @@ import xiangshan.frontend.icache._
import utils._
import utility._
import scala.math._
import java.util.ResourceBundle.Control
class FrontendTopDownBundle(implicit p: Parameters) extends XSBundle {
val reasons = Vec(TopDownCounters.NumStallReasons.id, Bool())
val stallWidth = UInt(log2Ceil(PredictWidth).W)
}
@chiselName
class FetchRequestBundle(implicit p: Parameters) extends XSBundle with HasICacheParameters {
......@@ -36,6 +42,8 @@ class FetchRequestBundle(implicit p: Parameters) extends XSBundle with HasICache
val ftqIdx = new FtqPtr
val ftqOffset = ValidUndirectioned(UInt(log2Ceil(PredictWidth).W))
val topdown_info = new FrontendTopDownBundle
def crossCacheline = startAddr(blockOffBits - 1) === 1.U
def fromFtqPcBundle(b: Ftq_RF_Components) = {
......@@ -74,6 +82,8 @@ class FtqICacheInfo(implicit p: Parameters)extends XSBundle with HasICacheParame
class IFUICacheIO(implicit p: Parameters)extends XSBundle with HasICacheParameters{
val icacheReady = Output(Bool())
val resp = Vec(PortNumber, ValidIO(new ICacheMainPipeResp))
val topdownIcacheMiss = Output(Bool())
val topdownItlbMiss = Output(Bool())
}
class FtqToICacheRequestBundle(implicit p: Parameters)extends XSBundle with HasICacheParameters{
......@@ -121,6 +131,8 @@ class FetchToIBuffer(implicit p: Parameters) extends XSBundle {
val acf = Vec(PredictWidth, Bool())
val crossPageIPFFix = Vec(PredictWidth, Bool())
val triggered = Vec(PredictWidth, new TriggerCf)
val topdown_info = new FrontendTopDownBundle
}
// class BitWiseUInt(val width: Int, val init: UInt) extends Module {
......@@ -569,6 +581,8 @@ class BranchPredictionResp(implicit p: Parameters) extends XSBundle with HasBPUC
val last_stage_spec_info = new SpeculativeInfo
val last_stage_ftb_entry = new FTBEntry
val topdown_info = new FrontendTopDownBundle
def selectedResp ={
val res =
PriorityMux(Seq(
......@@ -596,6 +610,7 @@ class BranchPredictionUpdate(implicit p: Parameters) extends XSBundle with HasBP
val cfi_idx = ValidUndirectioned(UInt(log2Ceil(PredictWidth).W))
val br_taken_mask = Vec(numBr, Bool())
val br_committed = Vec(numBr, Bool()) // High only when br valid && br committed
val jmp_taken = Bool()
val mispred_mask = Vec(numBr+1, Bool())
val pred_hit = Bool()
......@@ -638,6 +653,28 @@ class BranchPredictionRedirect(implicit p: Parameters) extends Redirect with Has
// }
// TODO: backend should pass topdown signals here
// must not change its parent since BPU has used asTypeOf(this type) from its parent class
require(isInstanceOf[Redirect])
val BTBMissBubble = Bool()
def ControlRedirectBubble = debugIsCtrl
// if mispred br not in ftb, count as BTB miss
def ControlBTBMissBubble = ControlRedirectBubble && !cfiUpdate.br_hit && !cfiUpdate.jr_hit
def TAGEMissBubble = ControlRedirectBubble && cfiUpdate.br_hit && !cfiUpdate.sc_hit
def SCMissBubble = ControlRedirectBubble && cfiUpdate.br_hit && cfiUpdate.sc_hit
def ITTAGEMissBubble = ControlRedirectBubble && cfiUpdate.jr_hit && !cfiUpdate.pd.isRet
def RASMissBubble = ControlRedirectBubble && cfiUpdate.jr_hit && cfiUpdate.pd.isRet
def MemVioRedirectBubble = debugIsMemVio
def OtherRedirectBubble = !debugIsCtrl && !debugIsMemVio
def connectRedirect(source: Redirect): Unit = {
for ((name, data) <- this.elements) {
if (source.elements.contains(name)) {
data := source.elements(name)
}
}
}
def display(cond: Bool): Unit = {
XSDebug(cond, p"-----------BranchPredictionRedirect----------- \n")
XSDebug(cond, p"-----------cfiUpdate----------- \n")
......
......@@ -133,6 +133,74 @@ class NewIFU(implicit p: Parameters) extends XSModule
def isLastInCacheline(addr: UInt): Bool = addr(blockOffBits - 1, 1) === 0.U
def numOfStage = 3
require(numOfStage > 1, "BPU numOfStage must be greater than 1")
val topdown_stages = RegInit(VecInit(Seq.fill(numOfStage)(0.U.asTypeOf(new FrontendTopDownBundle))))
dontTouch(topdown_stages)
// bubble events in IFU, only happen in stage 1
val icacheMissBubble = Wire(Bool())
val itlbMissBubble =Wire(Bool())
// only driven by clock, not valid-ready
topdown_stages(0) := fromFtq.req.bits.topdown_info
for (i <- 1 until numOfStage) {
topdown_stages(i) := topdown_stages(i - 1)
}
when (icacheMissBubble) {
topdown_stages(1).reasons(TopDownCounters.ICacheMissBubble.id) := true.B
}
when (itlbMissBubble) {
topdown_stages(1).reasons(TopDownCounters.ITLBMissBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info := topdown_stages(numOfStage - 1)
when (fromFtq.topdown_redirect.valid) {
// only redirect from backend, IFU redirect itself is handled elsewhere
when (fromFtq.topdown_redirect.bits.debugIsCtrl) {
/*
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.ControlRedirectBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.ControlRedirectBubble.id) := true.B
*/
when (fromFtq.topdown_redirect.bits.ControlBTBMissBubble) {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.BTBMissBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.BTBMissBubble.id) := true.B
} .elsewhen (fromFtq.topdown_redirect.bits.TAGEMissBubble) {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.TAGEMissBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.TAGEMissBubble.id) := true.B
} .elsewhen (fromFtq.topdown_redirect.bits.SCMissBubble) {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.SCMissBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.SCMissBubble.id) := true.B
} .elsewhen (fromFtq.topdown_redirect.bits.ITTAGEMissBubble) {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.ITTAGEMissBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.ITTAGEMissBubble.id) := true.B
} .elsewhen (fromFtq.topdown_redirect.bits.RASMissBubble) {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.RASMissBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.RASMissBubble.id) := true.B
}
} .elsewhen (fromFtq.topdown_redirect.bits.debugIsMemVio) {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.MemVioRedirectBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.MemVioRedirectBubble.id) := true.B
} .otherwise {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.OtherRedirectBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.OtherRedirectBubble.id) := true.B
}
}
class TlbExept(implicit p: Parameters) extends XSBundle{
val pageFault = Bool()
val accessFault = Bool()
......@@ -180,6 +248,16 @@ class NewIFU(implicit p: Parameters) extends XSModule
fromFtq.req.ready := f1_ready && io.icacheInter.icacheReady
when (wb_redirect) {
when (f3_wb_not_flush) {
topdown_stages(2).reasons(TopDownCounters.BTBMissBubble.id) := true.B
}
for (i <- 0 until numOfStage - 1) {
topdown_stages(i).reasons(TopDownCounters.BTBMissBubble.id) := true.B
}
}
/** <PERF> f0 fetch bubble */
XSPerfAccumulate("fetch_bubble_ftq_not_valid", !fromFtq.req.valid && fromFtq.req.ready )
......@@ -247,6 +325,9 @@ class NewIFU(implicit p: Parameters) extends XSModule
icacheRespAllValid := f2_icache_all_resp_reg || f2_icache_all_resp_wire
icacheMissBubble := io.icacheInter.topdownIcacheMiss
itlbMissBubble := io.icacheInter.topdownItlbMiss
io.icacheStop := !f3_ready
when(f2_flush) {f2_icache_all_resp_reg := false.B}
......@@ -389,7 +470,7 @@ class NewIFU(implicit p: Parameters) extends XSModule
val f3_af_vec = RegEnable(next = f2_af_vec, enable = f2_fire)
val f3_pf_vec = RegEnable(next = f2_pf_vec , enable = f2_fire)
val f3_pc = RegEnable(next = f2_pc, enable = f2_fire)
val f3_half_snpc = RegEnable(next = f2_half_snpc, enable = f2_fire)
val f3_half_snpc = RegEnable(next = f2_half_snpc, enable = f2_fire)
val f3_instr_range = RegEnable(next = f2_instr_range, enable = f2_fire)
val f3_foldpc = RegEnable(next = f2_foldpc, enable = f2_fire)
val f3_crossPageFault = RegEnable(next = f2_crossPageFault, enable = f2_fire)
......@@ -582,7 +663,7 @@ class NewIFU(implicit p: Parameters) extends XSModule
!f3_pd(idx).isRVC && checkerOutStage1.fixedRange(idx) && f3_instr_valid(idx) && !checkerOutStage1.fixedTaken(idx) && ! f3_req_is_mmio
}
val f3_last_validIdx = ~ParallelPriorityEncoder(checkerOutStage1.fixedRange.reverse)
val f3_last_validIdx = ParallelPosteriorityEncoder(checkerOutStage1.fixedRange)
val f3_hasLastHalf = hasLastHalf((PredictWidth - 1).U)
val f3_false_lastHalf = hasLastHalf(f3_last_validIdx)
......@@ -746,6 +827,8 @@ class NewIFU(implicit p: Parameters) extends XSModule
}
val checkFlushWb = Wire(Valid(new PredecodeWritebackBundle))
val checkFlushWbjalTargetIdx = ParallelPriorityEncoder(VecInit(wb_pd.zip(wb_instr_valid).map{case (pd, v) => v && pd.isJal }))
val checkFlushWbTargetIdx = ParallelPriorityEncoder(wb_check_result_stage2.fixedMissPred)
checkFlushWb.valid := wb_valid
checkFlushWb.bits.pc := wb_pc
checkFlushWb.bits.pd := wb_pd
......@@ -756,8 +839,8 @@ class NewIFU(implicit p: Parameters) extends XSModule
checkFlushWb.bits.misOffset.bits := Mux(wb_half_flush, wb_lastIdx, ParallelPriorityEncoder(wb_check_result_stage2.fixedMissPred))
checkFlushWb.bits.cfiOffset.valid := ParallelOR(wb_check_result_stage1.fixedTaken)
checkFlushWb.bits.cfiOffset.bits := ParallelPriorityEncoder(wb_check_result_stage1.fixedTaken)
checkFlushWb.bits.target := Mux(wb_half_flush, wb_half_target, wb_check_result_stage2.fixedTarget(ParallelPriorityEncoder(wb_check_result_stage2.fixedMissPred)))
checkFlushWb.bits.jalTarget := wb_check_result_stage2.fixedTarget(ParallelPriorityEncoder(VecInit(wb_pd.zip(wb_instr_valid).map{case (pd, v) => v && pd.isJal })))
checkFlushWb.bits.target := Mux(wb_half_flush, wb_half_target, wb_check_result_stage2.fixedTarget(checkFlushWbTargetIdx))
checkFlushWb.bits.jalTarget := wb_check_result_stage2.fixedTarget(checkFlushWbjalTargetIdx)
checkFlushWb.bits.instrRange := wb_instr_range.asTypeOf(Vec(PredictWidth, Bool()))
toFtq.pdWb := Mux(wb_valid, checkFlushWb, mmioFlushWb)
......@@ -827,6 +910,8 @@ class NewIFU(implicit p: Parameters) extends XSModule
XSPerfAccumulate("except_0", f3_perf_info.except_0 && io.toIbuffer.fire() )
XSPerfHistogram("ifu2ibuffer_validCnt", PopCount(io.toIbuffer.bits.valid & io.toIbuffer.bits.enqEnable), io.toIbuffer.fire, 0, PredictWidth + 1, 1)
val isWriteFetchToIBufferTable = WireInit(Constantin.createRecord("isWriteFetchToIBufferTable" + p(XSCoreParamsKey).HartId.toString))
val isWriteIfuWbToFtqTable = WireInit(Constantin.createRecord("isWriteIfuWbToFtqTable" + p(XSCoreParamsKey).HartId.toString))
val fetchToIBufferTable = ChiselDB.createTable("FetchToIBuffer" + p(XSCoreParamsKey).HartId.toString, new FetchToIBufferDB)
val ifuWbToFtqTable = ChiselDB.createTable("IfuWbToFtq" + p(XSCoreParamsKey).HartId.toString, new IfuWbToFtqDB)
......@@ -848,14 +933,14 @@ class NewIFU(implicit p: Parameters) extends XSModule
fetchToIBufferTable.log(
data = fetchIBufferDumpData,
en = io.toIbuffer.fire(),
en = isWriteFetchToIBufferTable.orR && io.toIbuffer.fire,
site = "IFU" + p(XSCoreParamsKey).HartId.toString,
clock = clock,
reset = reset
)
ifuWbToFtqTable.log(
data = ifuWbToFtqDumpData,
en = checkFlushWb.valid,
en = isWriteIfuWbToFtqTable.orR && checkFlushWb.valid,
site = "IFU" + p(XSCoreParamsKey).HartId.toString,
clock = clock,
reset = reset
......
......@@ -273,11 +273,10 @@ class ITTageTable
us.io.waddr := update_idx
us.io.wdata := io.update.u
val wrbypass = Module(new WrBypass(UInt(ITTageCtrBits.W), wrBypassEntries, log2Ceil(nRows), tagWidth=tagLen))
val wrbypass = Module(new WrBypass(UInt(ITTageCtrBits.W), wrBypassEntries, log2Ceil(nRows)))
wrbypass.io.wen := io.update.valid
wrbypass.io.write_idx := update_idx
wrbypass.io.write_tag.map(_ := update_tag)
wrbypass.io.write_data.map(_ := update_wdata.ctr)
val old_ctr = Mux(wrbypass.io.hit, wrbypass.io.hit_data(0).bits, io.update.oldCtr)
......@@ -420,7 +419,7 @@ class ITTage(implicit p: Parameters) extends BaseITTage {
val update = io.update.bits
val updateValid =
update.is_jalr && !update.is_ret && u_valid && update.ftb_entry.jmpValid &&
update.jmp_taken
update.jmp_taken && update.cfi_idx.valid && update.cfi_idx.bits === update.ftb_entry.tailSlot.offset
val updateFhist = update.spec_info.folded_hist
// meta is splited by composer
......@@ -470,12 +469,12 @@ class ITTage(implicit p: Parameters) extends BaseITTage {
s2_tageTaken := Mux1H(Seq(
(provided && !providerNull, providerInfo.ctr(ITTageCtrBits-1)),
(altProvided && providerNull, altProviderInfo.ctr(ITTageCtrBits-1)),
(!provided, basePred)
(!provided || providerNull && !altProvided, basePred)
)) // TODO: reintroduce BIM
s2_tageTarget := Mux1H(Seq(
(provided && !providerNull, providerInfo.target),
(altProvided && providerNull, altProviderInfo.target),
(!provided, baseTarget)
(!provided || providerNull && !altProvided, baseTarget)
))
s2_finalAltPred := Mux(altProvided, altProviderInfo.ctr(ITTageCtrBits-1), basePred)
s2_provided := provided
......
......@@ -31,9 +31,17 @@ class IbufPtr(implicit p: Parameters) extends CircularQueuePtr[IbufPtr](
class IBufferIO(implicit p: Parameters) extends XSBundle {
val flush = Input(Bool())
val ControlRedirect = Input(Bool())
val ControlBTBMissBubble = Input(Bool())
val TAGEMissBubble = Input(Bool())
val SCMissBubble = Input(Bool())
val ITTAGEMissBubble = Input(Bool())
val RASMissBubble = Input(Bool())
val MemVioRedirect = Input(Bool())
val in = Flipped(DecoupledIO(new FetchToIBuffer))
val out = Vec(DecodeWidth, DecoupledIO(new CtrlFlow))
val full = Output(Bool())
val stallReason = new StallReasonIO(DecodeWidth)
}
class IBufEntry(implicit p: Parameters) extends XSBundle {
......@@ -89,6 +97,38 @@ class IBufEntry(implicit p: Parameters) extends XSBundle {
class Ibuffer(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper with HasPerfEvents {
val io = IO(new IBufferIO)
dontTouch(io.stallReason)
val topdown_stage = RegInit(0.U.asTypeOf(new FrontendTopDownBundle))
dontTouch(topdown_stage)
topdown_stage := io.in.bits.topdown_info
when (io.flush) {
when (io.ControlRedirect) {
when (io.ControlBTBMissBubble) {
topdown_stage.reasons(TopDownCounters.BTBMissBubble.id) := true.B
} .elsewhen (io.TAGEMissBubble) {
topdown_stage.reasons(TopDownCounters.TAGEMissBubble.id) := true.B
} .elsewhen (io.SCMissBubble) {
topdown_stage.reasons(TopDownCounters.SCMissBubble.id) := true.B
} .elsewhen (io.ITTAGEMissBubble) {
topdown_stage.reasons(TopDownCounters.ITTAGEMissBubble.id) := true.B
} .elsewhen (io.RASMissBubble) {
topdown_stage.reasons(TopDownCounters.RASMissBubble.id) := true.B
}
} .elsewhen (io.MemVioRedirect) {
topdown_stage.reasons(TopDownCounters.MemVioRedirectBubble.id) := true.B
} .otherwise {
topdown_stage.reasons(TopDownCounters.OtherRedirectBubble.id) := true.B
}
}
val dequeueInsufficient = Wire(Bool())
val matchBubble = Wire(UInt(log2Up(TopDownCounters.NumStallReasons.id).W))
matchBubble := (TopDownCounters.NumStallReasons.id - 1).U - PriorityEncoder(topdown_stage.reasons.reverse)
dontTouch(matchBubble)
val matchBubbleVec = WireInit(VecInit(topdown_stage.reasons.zipWithIndex.map{case (r, i) => matchBubble === i.U}))
val ibuf = Module(new SyncDataModuleTemplate(new IBufEntry, IBufSize, 2 * DecodeWidth, PredictWidth))
......@@ -132,6 +172,32 @@ class Ibuffer(implicit p: Parameters) extends XSModule with HasCircularQueuePtrH
((1 << DecodeWidth) - 1).U,
UIntToMask(validEntries(log2Ceil(DecodeWidth) - 1, 0), DecodeWidth)
)
val deqValidCount = PopCount(validVec.asBools)
val deqWasteCount = DecodeWidth.U - deqValidCount
dequeueInsufficient := deqValidCount < DecodeWidth.U
io.stallReason.reason.map(_ := 0.U)
for (i <- 0 until DecodeWidth) {
when (i.U < deqWasteCount) {
io.stallReason.reason(DecodeWidth - i - 1) := matchBubble
}
}
when (!(deqWasteCount === DecodeWidth.U || topdown_stage.reasons.asUInt.orR)) {
// should set reason for FetchFragmentationStall
// topdown_stage.reasons(TopDownCounters.FetchFragmentationStall.id) := true.B
for (i <- 0 until DecodeWidth) {
when (i.U < deqWasteCount) {
io.stallReason.reason(DecodeWidth - i - 1) := TopDownCounters.FetchFragBubble.id.U
}
}
}
when (io.stallReason.backReason.valid) {
io.stallReason.reason.map(_ := io.stallReason.backReason.bits)
}
val deqData = Reg(Vec(DecodeWidth, new IBufEntry))
for (i <- 0 until DecodeWidth) {
io.out(i).valid := validVec(i)
......@@ -203,10 +269,26 @@ class Ibuffer(implicit p: Parameters) extends XSModule with HasCircularQueuePtrH
QueuePerf(IBufSize, validEntries, !allowEnq)
XSPerfAccumulate("flush", io.flush)
XSPerfAccumulate("hungry", instrHungry)
if (env.EnableTopDown) {
val ibuffer_IDWidth_hvButNotFull = afterInit && (validEntries =/= 0.U) && (validEntries < DecodeWidth.U) && !headBubble
XSPerfAccumulate("ibuffer_IDWidth_hvButNotFull", ibuffer_IDWidth_hvButNotFull)
}
val ibuffer_IDWidth_hvButNotFull = afterInit && (validEntries =/= 0.U) && (validEntries < DecodeWidth.U) && !headBubble
XSPerfAccumulate("ibuffer_IDWidth_hvButNotFull", ibuffer_IDWidth_hvButNotFull)
/*
XSPerfAccumulate("ICacheMissBubble", Mux(matchBubbleVec(TopDownCounters.ICacheMissBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("ITLBMissBubble", Mux(matchBubbleVec(TopDownCounters.ITLBMissBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("ControlRedirectBubble", Mux(matchBubbleVec(TopDownCounters.ControlRedirectBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("MemVioRedirectBubble", Mux(matchBubbleVec(TopDownCounters.MemVioRedirectBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("OtherRedirectBubble", Mux(matchBubbleVec(TopDownCounters.OtherRedirectBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("BTBMissBubble", Mux(matchBubbleVec(TopDownCounters.BTBMissBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("OverrideBubble", Mux(matchBubbleVec(TopDownCounters.OverrideBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("FtqUpdateBubble", Mux(matchBubbleVec(TopDownCounters.FtqUpdateBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("FtqFullStall", Mux(matchBubbleVec(TopDownCounters.FtqFullStall.id), deqWasteCount, 0.U))
XSPerfAccumulate("FetchFragmentBubble",
Mux(deqWasteCount === DecodeWidth.U || topdown_stage.reasons.asUInt.orR, 0.U, deqWasteCount))
XSPerfAccumulate("TAGEMissBubble", Mux(matchBubbleVec(TopDownCounters.TAGEMissBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("SCMissBubble", Mux(matchBubbleVec(TopDownCounters.SCMissBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("ITTAGEMissBubble", Mux(matchBubbleVec(TopDownCounters.ITTAGEMissBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("RASMissBubble", Mux(matchBubbleVec(TopDownCounters.RASMissBubble.id), deqWasteCount, 0.U))
*/
val perfEvents = Seq(
("IBuffer_Flushed ", io.flush ),
......
......@@ -291,6 +291,11 @@ trait HasSC extends HasSCParameter with HasPerfEvents { this: Tage =>
s2_tageTakens(w)
)
val s3_disagree = RegEnable(s2_disagree, io.s2_fire)
// FIXME: not portable
io.out.last_stage_ftb_entry.brSlots(0).sc := RegEnable(s2_disagree(0), io.s2_fire)
io.out.last_stage_ftb_entry.tailSlot.sc := RegEnable(s2_disagree(1), io.s2_fire)
scMeta.tageTakens(w) := RegEnable(s2_tageTakens(w), io.s2_fire)
scMeta.scUsed(w) := RegEnable(s2_provideds(w), io.s2_fire)
scMeta.scPreds(w) := RegEnable(s2_scPreds(s2_chooseBit), io.s2_fire)
......
......@@ -420,7 +420,7 @@ class TageTable
}
val bank_wrbypasses = Seq.fill(nBanks)(Seq.fill(numBr)(
Module(new WrBypass(UInt(TageCtrBits.W), perBankWrbypassEntries, 1, tagWidth=tagLen))
Module(new WrBypass(UInt(TageCtrBits.W), perBankWrbypassEntries, log2Ceil(bankSize)))
)) // let it corresponds to logical brIdx
for (b <- 0 until nBanks) {
......@@ -456,7 +456,6 @@ class TageTable
val br_pidx = get_phy_br_idx(update_unhashed_idx, li)
wrbypass.io.wen := io.update.mask(li) && update_req_bank_1h(b)
wrbypass.io.write_idx := get_bank_idx(update_idx)
wrbypass.io.write_tag.map(_ := update_tag)
wrbypass.io.write_data(0) := Mux1H(UIntToOH(br_pidx, numBr), per_bank_update_wdata(b)).ctr
}
}
......@@ -677,12 +676,12 @@ class Tage(implicit p: Parameters) extends BaseTage {
resp_meta.allocates(i) := RegEnable(allocatableSlots, io.s2_fire)
val s1_bimCtr = bt.io.s1_cnt(i)
s1_altUsed(i) := !provided || providerInfo.use_alt_on_unconf
s1_tageTakens(i) :=
Mux(!provided || providerInfo.use_alt_on_unconf,
Mux(s1_altUsed(i),
s1_bimCtr(1),
providerInfo.resp.ctr(TageCtrBits-1)
)
s1_altUsed(i) := !provided || providerInfo.use_alt_on_unconf
s1_finalAltPreds(i) := s1_bimCtr(1)
s1_basecnts(i) := s1_bimCtr
s1_useAltOnNa(i) := providerInfo.use_alt_on_unconf
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -18,7 +18,7 @@ import xiangshan.backend.decode.DecodeUnit
object DecodeMain extends App with HasRocketChipStageUtils {
override def main(args: Array[String]): Unit = {
val (config, firrtlOpts, firrtlComplier) = ArgParser.parse(args)
val (config, firrtlOpts, firrtlComplier, firtoolOpts) = ArgParser.parse(args)
// //val soc = DisableMonitors(p => LazyModule(new XSTop()(p)))(config)
// If Complex Params are needed, wrap it with a Top Module to do dirty works,
// and use "chisel3.aop.Select.collectDeep[ModuleWanted](WrapperModule){case a: ModuleWanted => a}.head.Params"
......
Subproject commit c83eac5e93a94b514f7aca26f1c58e3934471d3b
Subproject commit 3d812fec9936ccd584df7721aa8c2d02e932d325
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册