Unverified commit 64886eef authored by William Wang, committed by GitHub

mem: disable l2l forward by default (#1283)

Parent 9d4e1137
......@@ -161,6 +161,7 @@ case class XSCoreParameters
  StorePipelineWidth: Int = 2,
  StoreBufferSize: Int = 16,
  StoreBufferThreshold: Int = 7,
  EnableLoadToLoadForward: Boolean = false,
  EnableFastForward: Boolean = false,
  EnableLdVioCheckAfterReset: Boolean = true,
  RefillSize: Int = 512,
......@@ -377,6 +378,7 @@ trait HasXSParameter {
  val StorePipelineWidth = coreParams.StorePipelineWidth
  val StoreBufferSize = coreParams.StoreBufferSize
  val StoreBufferThreshold = coreParams.StoreBufferThreshold
  val EnableLoadToLoadForward = coreParams.EnableLoadToLoadForward
  val EnableFastForward = coreParams.EnableFastForward
  val EnableLdVioCheckAfterReset = coreParams.EnableLdVioCheckAfterReset
  val RefillSize = coreParams.RefillSize
......
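The two hunks above add an EnableLoadToLoadForward parameter to XSCoreParameters (default false) and expose it through HasXSParameter, so the forwarding hardware below is only generated when a configuration opts in. A minimal standalone Scala sketch of that case-class parameter pattern (a hypothetical CoreParams stand-in, not the actual XiangShan config machinery):

// Hypothetical, simplified stand-in for XSCoreParameters; the real class has
// many more fields and is consumed through the HasXSParameter trait.
case class CoreParams(
  LoadPipelineWidth: Int = 2,
  EnableLoadToLoadForward: Boolean = false // off by default, as in this commit
)

object CoreParamsDemo extends App {
  val default = CoreParams()
  // A configuration that wants load-to-load forwarding back would override
  // the field, for example via copy() on the default parameter set.
  val withL2L = default.copy(EnableLoadToLoadForward = true)
  println(s"default = ${default.EnableLoadToLoadForward}, withL2L = ${withL2L.EnableLoadToLoadForward}")
}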
......@@ -577,34 +577,38 @@ class ReservationStation(params: RSParams)(implicit p: Parameters) extends XSMod
    // we reduce its latency for one cycle since it does not need to read
    // from data array. Timing to be optimized later.
    if (params.isLoad) {
      val ldFastDeq = Wire(io.deq(i).cloneType)
      // Condition: wakeup by load (to select load wakeup bits)
      val ldCanBeFast = VecInit(
        wakeupBypassMask.drop(exuParameters.AluCnt).take(exuParameters.LduCnt).map(_.asUInt.orR)
      ).asUInt
      ldFastDeq.valid := issueVec(i).valid && ldCanBeFast.orR
      ldFastDeq.ready := true.B
      ldFastDeq.bits.src := DontCare
      ldFastDeq.bits.uop := s1_out(i).bits.uop
      // when last cycle load has fast issue, cancel this cycle's normal issue and let it go
      val lastCycleLdFire = RegNext(ldFastDeq.valid && !s2_deq(i).valid && io.deq(i).ready)
      when (lastCycleLdFire) {
        s2_deq(i).valid := false.B
        s2_deq(i).ready := true.B
      }
      // For now, we assume deq.valid has higher priority than ldFastDeq.
      when (!s2_deq(i).valid) {
        io.deq(i).valid := ldFastDeq.valid
        io.deq(i).bits := ldFastDeq.bits
        s2_deq(i).ready := true.B
      }
      io.load.get.fastMatch(i) := Mux(s2_deq(i).valid, 0.U, ldCanBeFast)
      when (!s2_deq(i).valid) {
        io.feedback.get(i).rsIdx := s1_issue_index(i)
        io.feedback.get(i).isFirstIssue := s1_first_issue(i)
      if (EnableLoadToLoadForward) {
        val ldFastDeq = Wire(io.deq(i).cloneType)
        // Condition: wakeup by load (to select load wakeup bits)
        val ldCanBeFast = VecInit(
          wakeupBypassMask.drop(exuParameters.AluCnt).take(exuParameters.LduCnt).map(_.asUInt.orR)
        ).asUInt
        ldFastDeq.valid := issueVec(i).valid && ldCanBeFast.orR
        ldFastDeq.ready := true.B
        ldFastDeq.bits.src := DontCare
        ldFastDeq.bits.uop := s1_out(i).bits.uop
        // when last cycle load has fast issue, cancel this cycle's normal issue and let it go
        val lastCycleLdFire = RegNext(ldFastDeq.valid && !s2_deq(i).valid && io.deq(i).ready)
        when (lastCycleLdFire) {
          s2_deq(i).valid := false.B
          s2_deq(i).ready := true.B
        }
        // For now, we assume deq.valid has higher priority than ldFastDeq.
        when (!s2_deq(i).valid) {
          io.deq(i).valid := ldFastDeq.valid
          io.deq(i).bits := ldFastDeq.bits
          s2_deq(i).ready := true.B
        }
        io.load.get.fastMatch(i) := Mux(s2_deq(i).valid, 0.U, ldCanBeFast)
        when (!s2_deq(i).valid) {
          io.feedback.get(i).rsIdx := s1_issue_index(i)
          io.feedback.get(i).isFirstIssue := s1_first_issue(i)
        }
        XSPerfAccumulate(s"fast_load_deq_valid_$i", !s2_deq(i).valid && ldFastDeq.valid)
        XSPerfAccumulate(s"fast_load_deq_fire_$i", !s2_deq(i).valid && ldFastDeq.valid && io.deq(i).ready)
      } else {
        io.load.get.fastMatch(i) := DontCare
      }
      XSPerfAccumulate(s"fast_load_deq_valid_$i", !s2_deq(i).valid && ldFastDeq.valid)
      XSPerfAccumulate(s"fast_load_deq_fire_$i", !s2_deq(i).valid && ldFastDeq.valid && io.deq(i).ready)
    }
    io.deq(i).bits.uop.debugInfo.issueTime := GTimer()
......
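The reservation-station hunk above wraps the existing fast-dequeue logic in a Scala-level if (EnableLoadToLoadForward), so the hardware is simply not elaborated when the parameter is false and fastMatch is tied off with DontCare. A minimal Chisel sketch of that elaboration-time pattern, using hypothetical module and port names unrelated to the real RS:

import chisel3._

// Hypothetical, simplified module (not XiangShan code) illustrating the pattern:
// a Scala-level `if` decides at elaboration time whether the fast-path logic
// exists at all, unlike `when`, which would build a runtime mux in both cases.
class FastPathDemo(enableFastPath: Boolean) extends Module {
  val io = IO(new Bundle {
    val slowValid = Input(Bool())
    val fastValid = Input(Bool())
    val outValid  = Output(Bool())
    val fastMatch = Output(Bool())
  })

  io.outValid := io.slowValid
  if (enableFastPath) {
    // Fast path only generated when the parameter is true; the slow path
    // still wins when it is valid, mirroring the priority comment in the RS code.
    when (!io.slowValid) {
      io.outValid := io.fastValid
    }
    io.fastMatch := !io.slowValid && io.fastValid
  } else {
    // No fast-path hardware at all; tie the match signal off.
    io.fastMatch := DontCare
  }
}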
......@@ -59,24 +59,29 @@ class LoadUnit_S0(implicit p: Parameters) extends XSModule with HasDCacheParamet
  val s0_uop = io.in.bits.uop
  val imm12 = WireInit(s0_uop.ctrl.imm(11,0))
  // slow vaddr from non-load insts
  val slowpath_vaddr = io.in.bits.src(0) + SignExt(s0_uop.ctrl.imm(11,0), VAddrBits)
  val slowpath_mask = genWmask(slowpath_vaddr, s0_uop.ctrl.fuOpType(1,0))
  // fast vaddr from load insts
  val fastpath_vaddrs = WireInit(VecInit(List.tabulate(LoadPipelineWidth)(i => {
    io.fastpath(i).data + SignExt(s0_uop.ctrl.imm(11,0), VAddrBits)
  })))
  val fastpath_masks = WireInit(VecInit(List.tabulate(LoadPipelineWidth)(i => {
    genWmask(fastpath_vaddrs(i), s0_uop.ctrl.fuOpType(1,0))
  })))
  val fastpath_vaddr = Mux1H(io.loadFastMatch, fastpath_vaddrs)
  val fastpath_mask = Mux1H(io.loadFastMatch, fastpath_masks)
  // select vaddr from 2 alus
  val s0_vaddr = Mux(io.loadFastMatch.orR, fastpath_vaddr, slowpath_vaddr)
  val s0_mask = Mux(io.loadFastMatch.orR, fastpath_mask, slowpath_mask)
  XSPerfAccumulate("load_to_load_forward", io.loadFastMatch.orR && io.in.fire())
  val s0_vaddr = WireInit(io.in.bits.src(0) + SignExt(s0_uop.ctrl.imm(11,0), VAddrBits))
  val s0_mask = WireInit(genWmask(s0_vaddr, s0_uop.ctrl.fuOpType(1,0)))
  if (EnableLoadToLoadForward) {
    // slow vaddr from non-load insts
    val slowpath_vaddr = io.in.bits.src(0) + SignExt(s0_uop.ctrl.imm(11,0), VAddrBits)
    val slowpath_mask = genWmask(slowpath_vaddr, s0_uop.ctrl.fuOpType(1,0))
    // fast vaddr from load insts
    val fastpath_vaddrs = WireInit(VecInit(List.tabulate(LoadPipelineWidth)(i => {
      io.fastpath(i).data + SignExt(s0_uop.ctrl.imm(11,0), VAddrBits)
    })))
    val fastpath_masks = WireInit(VecInit(List.tabulate(LoadPipelineWidth)(i => {
      genWmask(fastpath_vaddrs(i), s0_uop.ctrl.fuOpType(1,0))
    })))
    val fastpath_vaddr = Mux1H(io.loadFastMatch, fastpath_vaddrs)
    val fastpath_mask = Mux1H(io.loadFastMatch, fastpath_masks)
    // select vaddr from 2 alus
    s0_vaddr := Mux(io.loadFastMatch.orR, fastpath_vaddr, slowpath_vaddr)
    s0_mask := Mux(io.loadFastMatch.orR, fastpath_mask, slowpath_mask)
    XSPerfAccumulate("load_to_load_forward", io.loadFastMatch.orR && io.in.fire())
  }
  val isSoftPrefetch = LSUOpType.isPrefetch(s0_uop.ctrl.fuOpType)
  val isSoftPrefetchRead = s0_uop.ctrl.fuOpType === LSUOpType.prefetch_r
......
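In LoadUnit_S0, the fast path forms the load address from another load's result selected by the one-hot loadFastMatch vector, and falls back to the register operand plus the sign-extended 12-bit immediate when no bit is set. A simplified, standalone Chisel sketch of that selection (hypothetical widths and port names; the real code computes one sum per load pipe and also derives the write mask):

import chisel3._
import chisel3.util._

// Hypothetical address-generation sketch: fastMatch is assumed one-hot (at most
// one bit set). Mux1H picks the matching load pipe's data as the base address;
// with no bit set, the ordinary register-file operand is used instead.
class LoadAddrGenDemo(numLoadPipes: Int = 2, vaddrBits: Int = 39) extends Module {
  val io = IO(new Bundle {
    val src       = Input(UInt(vaddrBits.W))                     // base from register file
    val fastData  = Input(Vec(numLoadPipes, UInt(vaddrBits.W)))  // data forwarded from load pipes
    val fastMatch = Input(UInt(numLoadPipes.W))                  // one-hot load-pipe select
    val imm12     = Input(UInt(12.W))
    val vaddr     = Output(UInt(vaddrBits.W))
  })

  // sign-extend the 12-bit immediate to the virtual address width
  val offset = Cat(Fill(vaddrBits - 12, io.imm12(11)), io.imm12)
  val base   = Mux(io.fastMatch.orR, Mux1H(io.fastMatch, io.fastData), io.src)
  io.vaddr := base + offset
}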