diff --git a/src/main/scala/xiangshan/Bundle.scala b/src/main/scala/xiangshan/Bundle.scala index b726912c4fbad12d671ecef21fdc3ec967873364..74f750a95843e94bbef2c87f6a7553f72f27a855 100644 --- a/src/main/scala/xiangshan/Bundle.scala +++ b/src/main/scala/xiangshan/Bundle.scala @@ -51,6 +51,14 @@ object ValidUndirectioned { } } +object RSFeedbackType { + val tlbMiss = 0.U(2.W) + val mshrFull = 1.U(2.W) + val dataInvalid = 2.U(2.W) + + def apply() = UInt(2.W) +} + class SCMeta(val useSC: Boolean)(implicit p: Parameters) extends XSBundle with HasSCParameter { val tageTaken = if (useSC) Bool() else UInt(0.W) val scUsed = if (useSC) Bool() else UInt(0.W) @@ -407,14 +415,13 @@ class RoqCommitIO(implicit p: Parameters) extends XSBundle { def hasCommitInstr = !isWalk && valid.asUInt.orR } -class TlbFeedback(implicit p: Parameters) extends XSBundle { +class RSFeedback(implicit p: Parameters) extends XSBundle { val rsIdx = UInt(log2Up(IssQueSize).W) val hit = Bool() val flushState = Bool() + val sourceType = RSFeedbackType() } -class RSFeedback(implicit p: Parameters) extends TlbFeedback - class FrontendToBackendIO(implicit p: Parameters) extends XSBundle { // to backend end val cfVec = Vec(DecodeWidth, DecoupledIO(new CtrlFlow)) diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala index 0e082d7029095d1a1217c585615f6bfcab9a134a..84d3e441bdc08aae000f7c42ffc3d2e8c9269328 100644 --- a/src/main/scala/xiangshan/backend/MemBlock.scala +++ b/src/main/scala/xiangshan/backend/MemBlock.scala @@ -101,6 +101,9 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) val storeUnits = Seq.fill(exuParameters.StuCnt)(Module(new StoreUnit)) val exeUnits = loadUnits ++ storeUnits + loadUnits.zipWithIndex.map(x => x._1.suggestName("LoadUnit_"+x._2)) + storeUnits.zipWithIndex.map(x => x._1.suggestName("StoreUnit_"+x._2)) + val atomicsUnit = Module(new AtomicsUnit) val loadWritebackOverride = Mux(atomicsUnit.io.out.valid, 
atomicsUnit.io.out.bits, loadUnits.head.io.ldout.bits) @@ -221,7 +224,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) for (i <- 0 until exuParameters.LduCnt) { loadUnits(i).io.redirect <> io.fromCtrlBlock.redirect loadUnits(i).io.flush <> io.fromCtrlBlock.flush - loadUnits(i).io.tlbFeedback <> reservationStations(i).io.memfeedback + loadUnits(i).io.rsFeedback <> reservationStations(i).io.memfeedback loadUnits(i).io.rsIdx := reservationStations(i).io.rsIdx // TODO: beautify it loadUnits(i).io.isFirstIssue := reservationStations(i).io.isFirstIssue // NOTE: just for dtlb's perf cnt loadUnits(i).io.dtlb <> dtlb.io.requestor(i) @@ -255,13 +258,16 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) stu.io.redirect <> io.fromCtrlBlock.redirect stu.io.flush <> io.fromCtrlBlock.flush - stu.io.tlbFeedback <> rs.io.memfeedback + stu.io.rsFeedback <> rs.io.memfeedback stu.io.rsIdx <> rs.io.rsIdx stu.io.isFirstIssue <> rs.io.isFirstIssue // NOTE: just for dtlb's perf cnt stu.io.dtlb <> dtlbReq stu.io.stin <> rs.io.deq stu.io.lsq <> lsq.io.storeIn(i) + // rs.io.storeData <> lsq.io.storeDataIn(i) + lsq.io.storeDataIn(i) := rs.io.stData + // sync issue info to rs lsq.io.storeIssue(i).valid := rs.io.deq.valid lsq.io.storeIssue(i).bits := rs.io.deq.bits @@ -321,6 +327,9 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) val st0_atomics = reservationStations(atomic_rs0).io.deq.valid && FuType.storeIsAMO(reservationStations(atomic_rs0).io.deq.bits.uop.ctrl.fuType) val st1_atomics = reservationStations(atomic_rs1).io.deq.valid && FuType.storeIsAMO(reservationStations(atomic_rs1).io.deq.bits.uop.ctrl.fuType) + val st0_data_atomics = reservationStations(atomic_rs0).io.stData.valid && FuType.storeIsAMO(reservationStations(atomic_rs0).io.stData.bits.uop.ctrl.fuType) + val st1_data_atomics = reservationStations(atomic_rs1).io.stData.valid && FuType.storeIsAMO(reservationStations(atomic_rs1).io.stData.bits.uop.ctrl.fuType) + when 
(st0_atomics) { reservationStations(atomic_rs0).io.deq.ready := atomicsUnit.io.in.ready storeUnits(0).io.stin.valid := false.B @@ -342,6 +351,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) atomicsUnit.io.in.valid := st0_atomics || st1_atomics atomicsUnit.io.in.bits := Mux(st0_atomics, reservationStations(atomic_rs0).io.deq.bits, reservationStations(atomic_rs1).io.deq.bits) + atomicsUnit.io.storeDataIn.valid := st0_data_atomics || st1_data_atomics + atomicsUnit.io.storeDataIn.bits := Mux(st0_data_atomics, reservationStations(atomic_rs0).io.stData.bits, reservationStations(atomic_rs1).io.stData.bits) atomicsUnit.io.rsIdx := Mux(st0_atomics, reservationStations(atomic_rs0).io.rsIdx, reservationStations(atomic_rs1).io.rsIdx) atomicsUnit.io.redirect <> io.fromCtrlBlock.redirect atomicsUnit.io.flush <> io.fromCtrlBlock.flush @@ -366,14 +377,14 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) } when (state === s_atomics_0) { - atomicsUnit.io.tlbFeedback <> reservationStations(atomic_rs0).io.memfeedback + atomicsUnit.io.rsFeedback <> reservationStations(atomic_rs0).io.memfeedback - assert(!storeUnits(0).io.tlbFeedback.valid) + assert(!storeUnits(0).io.rsFeedback.valid) } when (state === s_atomics_1) { - atomicsUnit.io.tlbFeedback <> reservationStations(atomic_rs1).io.memfeedback + atomicsUnit.io.rsFeedback <> reservationStations(atomic_rs1).io.memfeedback - assert(!storeUnits(1).io.tlbFeedback.valid) + assert(!storeUnits(1).io.rsFeedback.valid) } lsq.io.exceptionAddr.lsIdx := io.lsqio.exceptionAddr.lsIdx diff --git a/src/main/scala/xiangshan/backend/issue/ReservationStation.scala b/src/main/scala/xiangshan/backend/issue/ReservationStation.scala index adc0fa84e52e850447cd7f6d0f57be90f1b2514e..ee3b7f7f8eaaf8ad0ba6dbbdffb3e64d5fa0d313 100644 --- a/src/main/scala/xiangshan/backend/issue/ReservationStation.scala +++ b/src/main/scala/xiangshan/backend/issue/ReservationStation.scala @@ -8,7 +8,7 @@ import utils._ import 
xiangshan.backend.decode.{ImmUnion, Imm_U} import xiangshan.backend.exu.{Exu, ExuConfig} import xiangshan.backend.roq.RoqPtr -import xiangshan.mem.SqPtr +import xiangshan.mem.{SqPtr, StoreDataBundle} import scala.math.max @@ -102,6 +102,7 @@ class ReservationStation val numExist = Output(UInt(iqIdxWidth.W)) val fromDispatch = Flipped(DecoupledIO(new MicroOp)) val deq = DecoupledIO(new ExuInput) + val stData = if (exuCfg == StExeUnitCfg) ValidIO(new StoreDataBundle) else null val srcRegValue = Input(Vec(srcNum, UInt(srcLen.W))) val stIssuePtr = if (exuCfg == LdExeUnitCfg) Input(new SqPtr()) else null @@ -143,6 +144,11 @@ class ReservationStation select.io.memfeedback := io.memfeedback select.io.flushState := io.memfeedback.bits.flushState } + if (exuCfg == StExeUnitCfg) { + select.io.dataReadyVec := ctrl.io.dataReadyVec + } else { + select.io.dataReadyVec := DontCare + } ctrl.io.in.valid := select.io.enq.ready && io.fromDispatch.valid // NOTE: ctrl doesnt care redirect for timing optimization ctrl.io.flush := io.flush @@ -162,6 +168,10 @@ class ReservationStation if (exuCfg == LdExeUnitCfg) { ctrl.io.stIssuePtr := RegNext(io.stIssuePtr) } + if (exuCfg == StExeUnitCfg) { + ctrl.io.selData.valid := select.io.deqData.valid + ctrl.io.selData.bits := select.io.deqData.bits + } data.io.in.valid := select.io.enq.fire() data.io.in.addr := select.io.enq.bits @@ -174,6 +184,7 @@ class ReservationStation } if (exuCfg == StExeUnitCfg) { data.io.fpRegValue := io.fpRegValue + data.io.selData := select.io.deqData.bits } data.io.sel := select.io.deq.bits data.io.listen.wen := ctrl.io.listen @@ -196,6 +207,12 @@ class ReservationStation if (srcNum > 1) { io.deq.bits.src2 := data.io.out(1) } if (srcNum > 2) { io.deq.bits.src3 := data.io.out(2) } if (exuCfg == JumpExeUnitCfg) { io.deq.bits.uop.cf.pc := data.io.pc } + + if (exuCfg == StExeUnitCfg) { + io.stData.bits.uop := ctrl.io.stData.bits + io.stData.bits.data := data.io.stData + io.stData.valid := ctrl.io.stData.valid + } } class 
ReservationStationSelect @@ -225,6 +242,7 @@ class ReservationStationSelect val redirectVec = Input(Vec(iqSize, Bool())) val readyVec = Input(Vec(iqSize, Bool())) + val dataReadyVec = Input(Vec(iqSize, Bool())) // NOTE: wanna dead code elimination eliminates the codes val validVec = Output(Vec(iqSize, Bool())) val indexVec = Output(Vec(iqSize, UInt(iqIdxWidth.W))) @@ -236,6 +254,7 @@ class ReservationStationSelect def fire() = valid && ready } val deq = DecoupledIO(UInt(iqIdxWidth.W)) + val deqData = if (exuCfg == StExeUnitCfg) ValidIO(UInt(iqIdxWidth.W)) else null val flushState = if (feedback) Input(Bool()) else null val isFirstIssue = if (feedback) Output(Bool()) else null @@ -251,7 +270,8 @@ class ReservationStationSelect * count queue : record replay cycle */ - val s_idle :: s_valid :: s_wait :: s_replay :: Nil = Enum(4) + val s_idle :: s_valid :: s_wait :: s_replay :: s_sent :: Nil = Enum(5) + val d_idle :: d_sent :: Nil = Enum(2) /* state machine * s_idle : empty slot, init state, set when deq * s_valid : ready to be secleted @@ -270,6 +290,11 @@ class ReservationStationSelect val emptyIdxQueue = widthMap(i => emptyQueue(indexQueue(i))) val countIdxQueue = widthMap(i => countQueue(indexQueue(i))) + // NOTE: wanna dead code elimination eliminates the below codes + val dataStateQueue = RegInit(VecInit(Seq.fill(iqSize)(d_idle))) + val dataValidQueue = VecInit(dataStateQueue.zip(stateQueue).map(a => a._1 === d_idle && a._2 =/= s_idle)) + val dataReadyIdxQueue = widthMap(i => dataValidQueue(indexQueue(i)) && io.dataReadyVec(indexQueue(i))) + // select ready // for no replay, select just equal to deq (attached) // with replay, select is just two stage with deq. 
@@ -305,6 +330,19 @@ class ReservationStationSelect (if(feedback) ~(0.U(iqSize.W)) else Mux(RegNext(selectValid && (io.redirect.valid || io.flush)), 0.U, ~(0.U(iqSize.W)))) + // store deq data, receiver(the sq) must be ready + // NOTE: wanna dead code elimination eliminates the below codes + val lastDataMask = Wire(UInt(iqSize.W)) + val dataMask = WireInit(VecInit((0 until iqSize).map(i => dataReadyIdxQueue(i)))).asUInt & lastDataMask + val dataIdx = ParallelPriorityMux(dataMask.asBools zip indexQueue) + val dataPtr = ParallelPriorityMux(dataMask.asBools.zipWithIndex.map{ case (a,i) => (a, i.U)}) // NOTE: the idx of indexQueue + val haveData = Cat(dataMask).orR + val dataIdxReg = RegNext(dataIdx, init = 0.U) + val dataValid = haveData + val dataReg = RegNext(dataValid, init = false.B) + val dataPtrReg = RegNext(Mux(moveMask(dataPtr), dataPtr-1.U, dataPtr), init = 0.U) + lastDataMask := ~Mux(dataReg, UIntToOH(dataPtrReg), 0.U) + // deq val dequeue = Mux(RegNext(io.flush), false.B, if (feedback) bubbleReg else bubbleReg || issueFire) @@ -327,11 +365,28 @@ class ReservationStationSelect if (feedback) { when (io.memfeedback.valid) { when (stateQueue(io.memfeedback.bits.rsIdx) === s_wait) { - stateQueue(io.memfeedback.bits.rsIdx) := Mux(io.memfeedback.bits.hit, s_idle, s_replay) + val s_finish_state = if (exuCfg == StExeUnitCfg) { + Mux(dataStateQueue(io.memfeedback.bits.rsIdx) === d_sent || (dataReg && dataIdxReg === io.memfeedback.bits.rsIdx), + s_idle, s_sent) + } else { s_idle } + stateQueue(io.memfeedback.bits.rsIdx) := Mux(io.memfeedback.bits.hit, s_finish_state, s_replay) } when (!io.memfeedback.bits.hit) { countQueue(io.memfeedback.bits.rsIdx) := replayDelay(cntCountQueue(io.memfeedback.bits.rsIdx)) } + assert(stateQueue(io.memfeedback.bits.rsIdx) === s_wait, "mem feedback but rs dont wait for it") + } + } + + if (exuCfg == StExeUnitCfg) { + when (dataReg) { + dataStateQueue(dataIdxReg) := d_sent + } + when (dataReg && stateQueue(dataIdxReg) === s_sent) { + 
stateQueue(dataIdxReg) := s_idle + } + for (i <- 0 until iqSize) { + assert(stateQueue(i) =/= s_sent || dataStateQueue(i) =/= d_sent, "dont want the state that addr and data both sent, but still not idle") } } @@ -383,6 +438,7 @@ class ReservationStationSelect val enqIdx = indexQueue(enqPtr) when (enqueue) { stateQueue(enqIdx) := s_valid + dataStateQueue(enqIdx) := d_idle cntCountQueue(enqIdx) := 0.U } @@ -394,6 +450,11 @@ class ReservationStationSelect io.deq.valid := selectValid io.deq.bits := selectIndex + if (exuCfg == StExeUnitCfg) { + io.deqData.valid := dataValid + io.deqData.bits := dataIdx + } + io.numExist := RegNext(Mux(nextTailPtr.flag, if(isPow2(iqSize)) (iqSize-1).U else iqSize.U, nextTailPtr.value), init = (iqSize - 1).U) assert(RegNext(Mux(tailPtr.flag, tailPtr.value===0.U, true.B))) @@ -463,10 +524,13 @@ class ReservationStationCtrl val uop = new MicroOp })) val sel = Flipped(ValidIO(UInt(iqIdxWidth.W))) + val selData = if (exuCfg == StExeUnitCfg) Flipped(ValidIO(UInt(iqIdxWidth.W))) else null val out = ValidIO(new MicroOp) + val stData = if (exuCfg == StExeUnitCfg) ValidIO(new MicroOp) else null val redirectVec = Output(Vec(iqSize, Bool())) val readyVec = Output(Vec(iqSize, Bool())) + val dataReadyVec = if (exuCfg == StExeUnitCfg) Output(Vec(IssQueSize, Bool())) else null val validVec = Input(Vec(iqSize, Bool())) val indexVec = Input(Vec(iqSize, UInt(iqIdxWidth.W))) @@ -486,7 +550,6 @@ class ReservationStationCtrl val enqEn = io.in.valid val enqEnReg = RegNext(enqEn && !(io.redirect.valid || io.flush), init = false.B) val enqUop = io.in.bits.uop - val enqUopReg = RegEnable(enqUop, selValid) val selPtr = io.sel.bits val selPtrReg = RegEnable(selPtr, selValid) val data = io.listen @@ -547,7 +610,12 @@ class ReservationStationCtrl } // load wait store - io.readyVec := srcQueueWire.map(Cat(_).andR) + if (exuCfg == StExeUnitCfg) { + io.readyVec := srcQueueWire.map(a => a(0)) + io.dataReadyVec := srcQueueWire.map(a => a(1)) + } else { + io.readyVec := 
srcQueueWire.map(Cat(_).andR) + } if (exuCfg == LdExeUnitCfg) { val ldWait = Reg(Vec(iqSize, Bool())) val sqIdx = Reg(Vec(iqSize, new SqPtr())) @@ -566,7 +634,7 @@ class ReservationStationCtrl } val redirectHit = io.redirectVec(selPtr) - val uop = Module(new SyncDataModuleTemplate(new MicroOp, iqSize, 1, 1)) + val uop = Module(new SyncDataModuleTemplate(new MicroOp, iqSize, if (exuCfg == StExeUnitCfg) 2 else 1, 1)) uop.io.raddr(0) := selPtr io.out.valid := RegNext(selValid && ~redirectHit) @@ -575,7 +643,14 @@ class ReservationStationCtrl uop.io.waddr(0) := enqPtr uop.io.wdata(0) := enqUop - class fastSendUop extends Bundle { + if (exuCfg == StExeUnitCfg) { // NOTE: send data part of st + uop.io.raddr(1) := io.selData.bits + io.stData.bits := uop.io.rdata(1) + io.stData.valid := RegNext(io.selData.valid && ~io.redirectVec(io.selData.bits)) + } + // NOTE: st dont fast wake others, dont care override + + class fastSendUop extends XSBundle { val pdest = UInt(PhyRegIdxWidth.W) val rfWen = Bool() val fpWen = Bool() @@ -595,6 +670,9 @@ class ReservationStationCtrl red := roq.needFlush(io.redirect, io.flush) } io.out.bits.roqIdx := roqIdx(selPtrReg) + if (exuCfg == StExeUnitCfg) { + io.stData.bits.roqIdx := roqIdx(RegEnable(io.selData.bits, io.selData.valid)) + } io.fastUopOut := DontCare if (fastWakeup) { @@ -790,7 +868,10 @@ class ReservationStationData } val sel = Input(UInt(iqIdxWidth.W)) + val selData = if(exuCfg == StExeUnitCfg) Input(UInt(iqIdxWidth.W)) else null val out = Output(Vec(srcNum, UInt(srcLen.W))) + val stData = if(exuCfg == StExeUnitCfg) Output(UInt(srcLen.W)) else null + val pc = if(exuCfg == JumpExeUnitCfg) Output(UInt(VAddrBits.W)) else null }) @@ -870,8 +951,18 @@ class ReservationStationData (0 until srcNum).foreach(i => data(i).w(0).wdata := io.srcRegValue(i) ) } // deq - data.map(_.r.addr := io.sel) + if (exuCfg == StExeUnitCfg) { + data(0).r.addr := io.sel + data(1).r.addr := io.selData + io.stData := data(1).r.rdata + } else { + 
data.map(_.r.addr := io.sel) + } + io.out := data.map(_.r.rdata) + if (exuCfg == StExeUnitCfg) { + io.out(1) := DontCare + } if(pcMem.nonEmpty){ pcMem.get.io.raddr(0) := io.sel io.pc := pcMem.get.io.rdata(0) diff --git a/src/main/scala/xiangshan/backend/roq/Roq.scala b/src/main/scala/xiangshan/backend/roq/Roq.scala index b23da74c8d53b2be2b2a5b63b8515b66b01d6ec9..e5ffa0110ed53440b05d6e42e92ffc7549af5a90 100644 --- a/src/main/scala/xiangshan/backend/roq/Roq.scala +++ b/src/main/scala/xiangshan/backend/roq/Roq.scala @@ -48,6 +48,7 @@ class RoqLsqIO(implicit p: Parameters) extends XSBundle { val pendingld = Output(Bool()) val pendingst = Output(Bool()) val commit = Output(Bool()) + val storeDataRoqWb = Input(Vec(StorePipelineWidth, Valid(new RoqPtr))) } class RoqEnqIO(implicit p: Parameters) extends XSBundle { @@ -266,6 +267,7 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc // writeback status // val writebacked = Reg(Vec(RoqSize, Bool())) val writebacked = Mem(RoqSize, Bool()) + val store_data_writebacked = Mem(RoqSize, Bool()) // data for redirect, exception, etc. 
// val flagBkup = RegInit(VecInit(List.fill(RoqSize)(false.B))) val flagBkup = Mem(RoqSize, Bool()) @@ -460,7 +462,8 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc io.commits.isWalk := state =/= s_idle val commit_v = Mux(state === s_idle, VecInit(deqPtrVec.map(ptr => valid(ptr.value))), VecInit(walkPtrVec.map(ptr => valid(ptr.value)))) - val commit_w = VecInit(deqPtrVec.map(ptr => writebacked(ptr.value))) + // store will be commited iff both sta & std have been writebacked + val commit_w = VecInit(deqPtrVec.map(ptr => writebacked(ptr.value) && store_data_writebacked(ptr.value))) val commit_exception = exceptionDataRead.valid && !isAfter(exceptionDataRead.bits.roqIdx, deqPtrVec.last) val commit_block = VecInit((0 until CommitWidth).map(i => !commit_w(i))) val allowOnlyOneCommit = commit_exception || intrBitSetReg @@ -655,11 +658,14 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc for (i <- 0 until RenameWidth) { when (canEnqueue(i)) { writebacked(enqPtrVec(i).value) := false.B + val isStu = io.enq.req(i).bits.ctrl.fuType === FuType.stu + store_data_writebacked(enqPtrVec(i).value) := !isStu } } when (exceptionGen.io.out.valid) { val wbIdx = exceptionGen.io.out.bits.roqIdx.value writebacked(wbIdx) := true.B + store_data_writebacked(wbIdx) := true.B } // writeback logic set numWbPorts writebacked to true for (i <- 0 until numWbPorts) { @@ -669,6 +675,12 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc writebacked(wbIdx) := !block_wb } } + // store data writeback logic mark store as data_writebacked + for (i <- 0 until StorePipelineWidth) { + when(io.lsq.storeDataRoqWb(i).valid) { + store_data_writebacked(io.lsq.storeDataRoqWb(i).bits.value) := true.B + } + } // flagBkup // enqueue logic set 6 flagBkup at most diff --git a/src/main/scala/xiangshan/mem/MemUtils.scala b/src/main/scala/xiangshan/mem/MemUtils.scala index 
2132d6877a9b967d7ccf9e47b197bbad6609dbb5..433035e8d530e331e293cec0b15f076b9cf08e85 100644 --- a/src/main/scala/xiangshan/mem/MemUtils.scala +++ b/src/main/scala/xiangshan/mem/MemUtils.scala @@ -50,6 +50,11 @@ class LsPipelineBundle(implicit p: Parameters) extends XSBundle { val forwardData = Vec(8, UInt(8.W)) } +class StoreDataBundle(implicit p: Parameters) extends XSBundle { + val data = UInt((XLEN+1).W) + val uop = new MicroOp +} + class LoadForwardQueryIO(implicit p: Parameters) extends XSBundle { val paddr = Output(UInt(PAddrBits.W)) val mask = Output(UInt(8.W)) @@ -62,9 +67,17 @@ class LoadForwardQueryIO(implicit p: Parameters) extends XSBundle { // val lqIdx = Output(UInt(LoadQueueIdxWidth.W)) val sqIdx = Output(new SqPtr) + + val dataInvalid = Input(Bool()) // Addr match, but data is not valid for now + // If dataInvalid, load inst should sleep for a while + // Feedback type should be RSFeedbackType.dataInvalid } -class MaskedLoadForwardQueryIO(implicit p: Parameters) extends XSBundle { +// LoadForwardQueryIO used in load pipeline +// +// Difference between PipeLoadForwardQueryIO and LoadForwardQueryIO: +// PipeIO use predecoded sqIdxMask for better forward timing +class PipeLoadForwardQueryIO(implicit p: Parameters) extends XSBundle { val paddr = Output(UInt(PAddrBits.W)) val mask = Output(UInt(8.W)) val uop = Output(new MicroOp) // for replay @@ -74,7 +87,13 @@ class MaskedLoadForwardQueryIO(implicit p: Parameters) extends XSBundle { val forwardMask = Input(Vec(8, Bool())) val forwardData = Input(Vec(8, UInt(8.W))) - val sqIdx = Output(new SqPtr) // for debug + val sqIdx = Output(new SqPtr) // for debug, should not be used in pipeline for timing reasons // sqIdxMask is calcuated in earlier stage for better timing val sqIdxMask = Output(UInt(StoreQueueSize.W)) + + // dataInvalid: addr match, but data is not valid for now + val dataInvalidFast = Input(Bool()) // resp to load_s1 + val dataInvalid = Input(Bool()) // resp to load_s2 + // If dataInvalid, load 
inst should sleep for a while + // Feedback type should be RSFeedbackType.dataInvalid } diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala index 28869e5ba7276f2b20cc57d467196250991aadb7..68cce1f4792b66db0c6a78df65332855cec0fe99 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala @@ -42,12 +42,13 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParamet val flush = Input(Bool()) val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LsPipelineBundle))) val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle))) + val storeDataIn = Vec(StorePipelineWidth, Flipped(Valid(new StoreDataBundle))) // store data, send to sq from rs val loadDataForwarded = Vec(LoadPipelineWidth, Input(Bool())) val needReplayFromRS = Vec(LoadPipelineWidth, Input(Bool())) val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq)) val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback int load val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store - val forward = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO)) + val forward = Vec(LoadPipelineWidth, Flipped(new PipeLoadForwardQueryIO)) val roq = Flipped(new RoqLsqIO) val rollback = Output(Valid(new Redirect)) val dcache = Flipped(ValidIO(new Refill)) @@ -101,6 +102,7 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParamet storeQueue.io.brqRedirect <> io.brqRedirect storeQueue.io.flush <> io.flush storeQueue.io.storeIn <> io.storeIn + storeQueue.io.storeDataIn <> io.storeDataIn storeQueue.io.sbuffer <> io.sbuffer storeQueue.io.mmioStout <> io.mmioStout storeQueue.io.roq <> io.roq diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala index f2a7d73ad708f50a256667c0f4b3cacf222c6ccc..4cd08f4387145eef0cbf83ec7fb8f803006c6243 
100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala @@ -75,7 +75,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule val loadDataForwarded = Vec(LoadPipelineWidth, Input(Bool())) val needReplayFromRS = Vec(LoadPipelineWidth, Input(Bool())) val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback int load - val load_s1 = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO)) + val load_s1 = Vec(LoadPipelineWidth, Flipped(new PipeLoadForwardQueryIO)) val roq = Flipped(new RoqLsqIO) val rollback = Output(Valid(new Redirect)) // replay now starts from load instead of store val dcache = Flipped(ValidIO(new Refill)) @@ -644,6 +644,11 @@ class LoadQueue(implicit p: Parameters) extends XSModule allowEnqueue := validCount + enqNumber <= (LoadQueueSize - RenameWidth).U + /** + * misc + */ + io.roq.storeDataRoqWb := DontCare // will be overwriten by store queue's result + // perf counter QueuePerf(LoadQueueSize, validCount, !allowEnqueue) io.lqFull := !allowEnqueue diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala index a9715c1dfbe97a8e345904647465c35e9c432553..bb5d83a59513732f63741ddaf93eee37ffe21669 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala @@ -7,7 +7,7 @@ import utils._ import xiangshan._ import xiangshan.cache._ import xiangshan.cache.{DCacheWordIO, DCacheLineIO, TlbRequestIO, MemoryOpConstants} -import xiangshan.backend.roq.RoqLsqIO +import xiangshan.backend.roq.{RoqLsqIO, RoqPtr} import difftest._ class SqPtr(implicit p: Parameters) extends CircularQueuePtr[SqPtr]( @@ -39,24 +39,25 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete val enq = new SqEnqIO val brqRedirect = Flipped(ValidIO(new Redirect)) val flush = Input(Bool()) - val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new 
LsPipelineBundle))) - val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq)) + val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle))) // store addr, data is not included + val storeDataIn = Vec(StorePipelineWidth, Flipped(Valid(new StoreDataBundle))) // store data, send to sq from rs + val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq)) // write commited store to sbuffer val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store - val forward = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO)) + val forward = Vec(LoadPipelineWidth, Flipped(new PipeLoadForwardQueryIO)) val roq = Flipped(new RoqLsqIO) val uncache = new DCacheWordIO // val refill = Flipped(Valid(new DCacheLineReq )) val exceptionAddr = new ExceptionAddrIO val sqempty = Output(Bool()) - val issuePtrExt = Output(new SqPtr) - val storeIssue = Vec(StorePipelineWidth, Flipped(Valid(new ExuInput))) + val issuePtrExt = Output(new SqPtr) // used to wake up delayed load/store + val storeIssue = Vec(StorePipelineWidth, Flipped(Valid(new ExuInput))) // used to update issuePtrExt val sqFull = Output(Bool()) }) // data modules val uop = Reg(Vec(StoreQueueSize, new MicroOp)) // val data = Reg(Vec(StoreQueueSize, new LsqEntry)) - val dataModule = Module(new StoreQueueData(StoreQueueSize, numRead = StorePipelineWidth, numWrite = StorePipelineWidth, numForward = StorePipelineWidth)) + val dataModule = Module(new SQDataModule(StoreQueueSize, numRead = StorePipelineWidth, numWrite = StorePipelineWidth, numForward = StorePipelineWidth)) dataModule.io := DontCare val paddrModule = Module(new SQPaddrModule(StoreQueueSize, numRead = StorePipelineWidth, numWrite = StorePipelineWidth, numForward = StorePipelineWidth)) paddrModule.io := DontCare @@ -65,8 +66,9 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete // state & misc val allocated = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // sq entry has been 
allocated + val addrvalid = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // non-mmio addr is valid val datavalid = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // non-mmio data is valid - val writebacked = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // inst has been writebacked to CDB + val allvalid = VecInit((0 until StoreQueueSize).map(i => addrvalid(i) && datavalid(i))) // non-mmio data & addr is valid val issued = Reg(Vec(StoreQueueSize, Bool())) // inst has been issued by rs val commited = Reg(Vec(StoreQueueSize, Bool())) // inst has been commited by roq val pending = Reg(Vec(StoreQueueSize, Bool())) // mmio pending: inst is an mmio inst, it will not be executed until it reachs the end of roq @@ -123,7 +125,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete uop(index) := io.enq.req(i).bits allocated(index) := true.B datavalid(index) := false.B - writebacked(index) := false.B + addrvalid(index) := false.B issued(index) := false.B commited(index) := false.B pending(index) := false.B @@ -168,7 +170,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete * * Most store instructions writeback to regfile in the previous cycle. * However, - * (1) For an mmio instruction with exceptions, we need to mark it as datavalid + * (1) For an mmio instruction with exceptions, we need to mark it as addrvalid * (in this way it will trigger an exception when it reaches ROB's head) * instead of pending to avoid sending them to lower level. * (2) For an mmio instruction without exceptions, we mark it as pending. @@ -176,39 +178,33 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete * Upon receiving the response, StoreQueue writes back the instruction * through arbiter with store units. It will later commit as normal. 
*/ + + // Write addr to sq for (i <- 0 until StorePipelineWidth) { - dataModule.io.wen(i) := false.B paddrModule.io.wen(i) := false.B + dataModule.io.mask.wen(i) := false.B val stWbIndex = io.storeIn(i).bits.uop.sqIdx.value when (io.storeIn(i).fire()) { - datavalid(stWbIndex) := !io.storeIn(i).bits.mmio - writebacked(stWbIndex) := !io.storeIn(i).bits.mmio + addrvalid(stWbIndex) := true.B//!io.storeIn(i).bits.mmio pending(stWbIndex) := io.storeIn(i).bits.mmio - val storeWbData = Wire(new SQDataEntry) - storeWbData := DontCare - storeWbData.mask := io.storeIn(i).bits.mask - storeWbData.data := io.storeIn(i).bits.data - - dataModule.io.waddr(i) := stWbIndex - dataModule.io.wdata(i) := storeWbData - dataModule.io.wen(i) := true.B + dataModule.io.mask.waddr(i) := stWbIndex + dataModule.io.mask.wdata(i) := io.storeIn(i).bits.mask + dataModule.io.mask.wen(i) := true.B paddrModule.io.waddr(i) := stWbIndex paddrModule.io.wdata(i) := io.storeIn(i).bits.paddr paddrModule.io.wen(i) := true.B - mmio(stWbIndex) := io.storeIn(i).bits.mmio - XSInfo("store write to sq idx %d pc 0x%x vaddr %x paddr %x data %x mmio %x\n", + XSInfo("store addr write to sq idx %d pc 0x%x vaddr %x paddr %x mmio %x\n", io.storeIn(i).bits.uop.sqIdx.value, io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.vaddr, io.storeIn(i).bits.paddr, - io.storeIn(i).bits.data, io.storeIn(i).bits.mmio - ) + ) } // vaddrModule write is delayed, as vaddrModule will not be read right after write vaddrModule.io.waddr(i) := RegNext(stWbIndex) @@ -216,6 +212,31 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete vaddrModule.io.wen(i) := RegNext(io.storeIn(i).fire()) } + // Write data to sq + for (i <- 0 until StorePipelineWidth) { + dataModule.io.data.wen(i) := false.B + io.roq.storeDataRoqWb(i).valid := false.B + io.roq.storeDataRoqWb(i).bits := DontCare + val stWbIndex = io.storeDataIn(i).bits.uop.sqIdx.value + when (io.storeDataIn(i).fire()) { + datavalid(stWbIndex) := true.B + + 
dataModule.io.data.waddr(i) := stWbIndex + dataModule.io.data.wdata(i) := genWdata(io.storeDataIn(i).bits.data, io.storeDataIn(i).bits.uop.ctrl.fuOpType(1,0)) + dataModule.io.data.wen(i) := true.B + + io.roq.storeDataRoqWb(i).valid := true.B + io.roq.storeDataRoqWb(i).bits := io.storeDataIn(i).bits.uop.roqIdx + + XSInfo("store data write to sq idx %d pc 0x%x data %x -> %x\n", + io.storeDataIn(i).bits.uop.sqIdx.value, + io.storeDataIn(i).bits.uop.cf.pc, + io.storeDataIn(i).bits.data, + dataModule.io.data.wdata(i) + ) + } + } + /** * load forward query * @@ -235,25 +256,33 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete // i.e. forward1 is the target entries with the same flag bits and forward2 otherwise val differentFlag = deqPtrExt(0).flag =/= io.forward(i).sqIdx.flag val forwardMask = io.forward(i).sqIdxMask - val storeWritebackedVec = WireInit(VecInit(Seq.fill(StoreQueueSize)(false.B))) - for (j <- 0 until StoreQueueSize) { - storeWritebackedVec(j) := datavalid(j) && allocated(j) // all datavalid terms need to be checked - } - val needForward1 = Mux(differentFlag, ~deqMask, deqMask ^ forwardMask) & storeWritebackedVec.asUInt - val needForward2 = Mux(differentFlag, forwardMask, 0.U(StoreQueueSize.W)) & storeWritebackedVec.asUInt - - XSDebug(p"$i f1 ${Binary(needForward1)} f2 ${Binary(needForward2)} " + + // all addrvalid terms need to be checked + val addrValidVec = WireInit(VecInit((0 until StoreQueueSize).map(i => addrvalid(i) && allocated(i)))) + val dataValidVec = WireInit(VecInit((0 until StoreQueueSize).map(i => datavalid(i)))) + val allValidVec = WireInit(VecInit((0 until StoreQueueSize).map(i => addrvalid(i) && datavalid(i) && allocated(i)))) + val canForward1 = Mux(differentFlag, ~deqMask, deqMask ^ forwardMask) & allValidVec.asUInt + val canForward2 = Mux(differentFlag, forwardMask, 0.U(StoreQueueSize.W)) & allValidVec.asUInt + val needForward = Mux(differentFlag, ~deqMask | forwardMask, deqMask ^ forwardMask) + + 
XSDebug(p"$i f1 ${Binary(canForward1)} f2 ${Binary(canForward2)} " + p"sqIdx ${io.forward(i).sqIdx} pa ${Hexadecimal(io.forward(i).paddr)}\n" ) - // do real fwd query - dataModule.io.needForward(i)(0) := needForward1 & paddrModule.io.forwardMmask(i).asUInt - dataModule.io.needForward(i)(1) := needForward2 & paddrModule.io.forwardMmask(i).asUInt + // do real fwd query (cam lookup in load_s1) + dataModule.io.needForward(i)(0) := canForward1 & paddrModule.io.forwardMmask(i).asUInt + dataModule.io.needForward(i)(1) := canForward2 & paddrModule.io.forwardMmask(i).asUInt paddrModule.io.forwardMdata(i) := io.forward(i).paddr + // Forward result will be generated 1 cycle later (load_s2) io.forward(i).forwardMask := dataModule.io.forwardMask(i) io.forward(i).forwardData := dataModule.io.forwardData(i) + + // If addr match, data not ready, mark it as dataInvalid + // load_s1: generate dataInvalid in load_s1 to set fastUop.valid to false + io.forward(i).dataInvalidFast := (addrValidVec.asUInt & ~dataValidVec.asUInt & paddrModule.io.forwardMmask(i).asUInt & needForward).orR + // load_s2 + io.forward(i).dataInvalid := RegNext(io.forward(i).dataInvalidFast) } /** @@ -262,7 +291,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete * States: * (1) writeback from store units: mark as pending * (2) when they reach ROB's head, they can be sent to uncache channel - * (3) response from uncache channel: mark as datavalid + * (3) response from uncache channel: proceed to writeback (s_wb) * (4) writeback to ROB (and other units): mark as writebacked * (5) ROB commits the instruction: same as normal instructions */ @@ -271,7 +300,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete val uncacheState = RegInit(s_idle) switch(uncacheState) { is(s_idle) { - when(io.roq.pendingst && pending(deqPtr) && allocated(deqPtr)) { + when(io.roq.pendingst && pending(deqPtr) && allocated(deqPtr) && datavalid(deqPtr) && addrvalid(deqPtr)) { uncacheState 
:= s_req } } @@ -306,6 +335,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete io.uncache.req.bits.id := DontCare when(io.uncache.req.fire()){ + // mmio store should not be committed until uncache req is sent pending(deqPtr) := false.B XSDebug( @@ -319,12 +349,9 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete // (3) response from uncache channel: mark as datavalid io.uncache.resp.ready := true.B - when (io.uncache.resp.fire()) { - datavalid(deqPtr) := true.B - } // (4) writeback to ROB (and other units): mark as writebacked - io.mmioStout.valid := uncacheState === s_wb // allocated(deqPtr) && datavalid(deqPtr) && !writebacked(deqPtr) + io.mmioStout.valid := uncacheState === s_wb io.mmioStout.bits.uop := uop(deqPtr) io.mmioStout.bits.uop.sqIdx := deqPtrExt(0) io.mmioStout.bits.data := dataModule.io.rdata(0).data // dataModule.io.rdata.read(deqPtr) @@ -335,7 +362,6 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete io.mmioStout.bits.debug.isPerfCnt := false.B io.mmioStout.bits.fflags := DontCare when (io.mmioStout.fire()) { - writebacked(deqPtr) := true.B allocated(deqPtr) := false.B } @@ -360,6 +386,8 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete // if !sbuffer.fire(), read the same ptr // if sbuffer.fire(), read next io.sbuffer(i).valid := allocated(ptr) && commited(ptr) && !mmio(ptr) + // Note that store data/addr should both be valid after store's commit + assert(!io.sbuffer(i).valid || allvalid(ptr)) io.sbuffer(i).bits.cmd := MemoryOpConstants.M_XWR io.sbuffer(i).bits.addr := paddrModule.io.rdata(i) io.sbuffer(i).bits.data := dataModule.io.rdata(i).data @@ -460,10 +488,11 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete if (i % 4 == 0) XSDebug("") XSDebug(false, true.B, "%x ", uop(i).cf.pc) PrintFlag(allocated(i), "a") - PrintFlag(allocated(i) && datavalid(i), "v") - 
PrintFlag(allocated(i) && writebacked(i), "w") + PrintFlag(allocated(i) && addrvalid(i), "a") + PrintFlag(allocated(i) && datavalid(i), "d") PrintFlag(allocated(i) && commited(i), "c") PrintFlag(allocated(i) && pending(i), "p") + PrintFlag(allocated(i) && mmio(i), "m") XSDebug(false, true.B, " ") if (i % 4 == 3 || i == StoreQueueSize - 1) XSDebug(false, true.B, "\n") } diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueueData.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueueData.scala index 8a7d98750dd2a362af13a51be3c36dffac53f3ed..1087fab7ab0f7305a8c76252581d100997f34532 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueueData.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueueData.scala @@ -63,9 +63,16 @@ class SQData8Module(size: Int, numRead: Int, numWrite: Int, numForward: Int)(imp val io = IO(new Bundle() { val raddr = Vec(numRead, Input(UInt(log2Up(size).W))) val rdata = Vec(numRead, Output(new SQData8Entry)) - val wen = Vec(numWrite, Input(Bool())) - val waddr = Vec(numWrite, Input(UInt(log2Up(size).W))) - val wdata = Vec(numWrite, Input(new SQData8Entry)) + val data = new Bundle() { + val wen = Vec(numWrite, Input(Bool())) + val waddr = Vec(numWrite, Input(UInt(log2Up(size).W))) + val wdata = Vec(numWrite, Input(UInt((XLEN/8).W))) + } + val mask = new Bundle() { + val wen = Vec(numWrite, Input(Bool())) + val waddr = Vec(numWrite, Input(UInt(log2Up(size).W))) + val wdata = Vec(numWrite, Input(Bool())) + } val needForward = Input(Vec(numForward, Vec(2, UInt(size.W)))) val forwardValid = Vec(numForward, Output(Bool())) @@ -76,10 +83,15 @@ class SQData8Module(size: Int, numRead: Int, numWrite: Int, numForward: Int)(imp val data = Reg(Vec(size, new SQData8Entry)) - // writeback to lq/sq + // writeback to sq (0 until numWrite).map(i => { - when(io.wen(i)){ - data(io.waddr(i)) := io.wdata(i) + when(io.data.wen(i)){ + data(io.data.waddr(i)).data := io.data.wdata(i) + } + }) + (0 until numWrite).map(i => { + when(io.mask.wen(i)){ + 
data(io.mask.waddr(i)).valid := io.mask.wdata(i) } }) @@ -91,7 +103,12 @@ class SQData8Module(size: Int, numRead: Int, numWrite: Int, numForward: Int)(imp // DataModuleTemplate should not be used when there're any write conflicts for (i <- 0 until numWrite) { for (j <- i+1 until numWrite) { - assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j))) + assert(!(io.data.wen(i) && io.data.wen(j) && io.data.waddr(i) === io.data.waddr(j))) + } + } + for (i <- 0 until numWrite) { + for (j <- i+1 until numWrite) { + assert(!(io.mask.wen(i) && io.mask.wen(j) && io.mask.waddr(i) === io.mask.waddr(j))) } } @@ -150,13 +167,20 @@ class SQDataEntry(implicit p: Parameters) extends XSBundle { val data = UInt(XLEN.W) } -class StoreQueueData(size: Int, numRead: Int, numWrite: Int, numForward: Int)(implicit p: Parameters) extends XSModule with HasDCacheParameters with HasCircularQueuePtrHelper { +class SQDataModule(size: Int, numRead: Int, numWrite: Int, numForward: Int)(implicit p: Parameters) extends XSModule with HasDCacheParameters with HasCircularQueuePtrHelper { val io = IO(new Bundle() { val raddr = Vec(numRead, Input(UInt(log2Up(size).W))) val rdata = Vec(numRead, Output(new SQDataEntry)) - val wen = Vec(numWrite, Input(Bool())) - val waddr = Vec(numWrite, Input(UInt(log2Up(size).W))) - val wdata = Vec(numWrite, Input(new SQDataEntry)) + val data = new Bundle() { + val wen = Vec(numWrite, Input(Bool())) + val waddr = Vec(numWrite, Input(UInt(log2Up(size).W))) + val wdata = Vec(numWrite, Input(UInt(XLEN.W))) + } + val mask = new Bundle() { + val wen = Vec(numWrite, Input(Bool())) + val waddr = Vec(numWrite, Input(UInt(log2Up(size).W))) + val wdata = Vec(numWrite, Input(UInt(8.W))) + } val needForward = Input(Vec(numForward, Vec(2, UInt(size.W)))) val forwardMask = Vec(numForward, Output(Vec(8, Bool()))) @@ -169,10 +193,12 @@ class StoreQueueData(size: Int, numRead: Int, numWrite: Int, numForward: Int)(im for (i <- 0 until numWrite) { // write to data8 for (j <- 0 until 
8) { - data8(j).io.waddr(i) := io.waddr(i) - data8(j).io.wdata(i).valid := io.wdata(i).mask(j) - data8(j).io.wdata(i).data := io.wdata(i).data(8*(j+1)-1, 8*j) - data8(j).io.wen(i) := io.wen(i) + data8(j).io.mask.waddr(i) := io.mask.waddr(i) + data8(j).io.mask.wdata(i) := io.mask.wdata(i)(j) + data8(j).io.mask.wen(i) := io.mask.wen(i) + data8(j).io.data.waddr(i) := io.data.waddr(i) + data8(j).io.data.wdata(i) := io.data.wdata(i)(8*(j+1)-1, 8*j) + data8(j).io.data.wen(i) := io.data.wen(i) } } @@ -188,7 +214,12 @@ class StoreQueueData(size: Int, numRead: Int, numWrite: Int, numForward: Int)(im // DataModuleTemplate should not be used when there're any write conflicts for (i <- 0 until numWrite) { for (j <- i+1 until numWrite) { - assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j))) + assert(!(io.data.wen(i) && io.data.wen(j) && io.data.waddr(i) === io.data.waddr(j))) + } + } + for (i <- 0 until numWrite) { + for (j <- i+1 until numWrite) { + assert(!(io.mask.wen(i) && io.mask.wen(j) && io.mask.waddr(i) === io.mask.waddr(j))) } } diff --git a/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala b/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala index 38bf8518077aa7aa06b8b9c4559f49475d507aa5..7480e5da17aef98eeaf9cd25d0eaa8680b86dd8a 100644 --- a/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala @@ -11,12 +11,13 @@ import difftest._ class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstants{ val io = IO(new Bundle() { val in = Flipped(Decoupled(new ExuInput)) + val storeDataIn = Flipped(Valid(new StoreDataBundle)) // src2 from rs val out = Decoupled(new ExuOutput) val dcache = new DCacheWordIO val dtlb = new TlbRequestIO val rsIdx = Input(UInt(log2Up(IssQueSize).W)) val flush_sbuffer = new SbufferFlushBundle - val tlbFeedback = ValidIO(new TlbFeedback) + val rsFeedback = ValidIO(new RSFeedback) val redirect = Flipped(ValidIO(new Redirect)) val flush = 
Input(Bool()) val exceptionAddr = ValidIO(UInt(VAddrBits.W)) @@ -27,6 +28,8 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstant //------------------------------------------------------- val s_invalid :: s_tlb :: s_flush_sbuffer_req :: s_flush_sbuffer_resp :: s_cache_req :: s_cache_resp :: s_finish :: Nil = Enum(7) val state = RegInit(s_invalid) + val addr_valid = RegInit(false.B) + val data_valid = RegInit(false.B) val in = Reg(new ExuInput()) val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec())) val atom_override_xtval = RegInit(false.B) @@ -68,18 +71,30 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstant io.in.ready := true.B when (io.in.fire()) { in := io.in.bits + in.src2 := in.src2 // leave src2 unchanged + addr_valid := true.B + } + when (io.storeDataIn.fire()) { + in.src2 := io.storeDataIn.bits.data + data_valid := true.B + } + when(data_valid && addr_valid) { state := s_tlb + addr_valid := false.B + data_valid := false.B } } + // Send TLB feedback to store issue queue // we send feedback right after we receives request // also, we always treat amo as tlb hit // since we will continue polling tlb all by ourself - io.tlbFeedback.valid := RegNext(RegNext(io.in.valid)) - io.tlbFeedback.bits.hit := true.B - io.tlbFeedback.bits.rsIdx := RegEnable(io.rsIdx, io.in.valid) - io.tlbFeedback.bits.flushState := DontCare + io.rsFeedback.valid := RegNext(RegNext(io.in.valid)) + io.rsFeedback.bits.hit := true.B + io.rsFeedback.bits.rsIdx := RegEnable(io.rsIdx, io.in.valid) + io.rsFeedback.bits.flushState := DontCare + io.rsFeedback.bits.sourceType := DontCare // tlb translation, manipulating signals && deal with exception when (state === s_tlb) { diff --git a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala index fcfcf71b426b43933337570f6ef2d60199446724..918b1535beba41cb64ea7723cf101d0224e1ecb0 100644 --- 
a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala @@ -13,7 +13,7 @@ class LoadToLsqIO(implicit p: Parameters) extends XSBundle { val ldout = Flipped(DecoupledIO(new ExuOutput)) val loadDataForwarded = Output(Bool()) val needReplayFromRS = Output(Bool()) - val forward = new MaskedLoadForwardQueryIO + val forward = new PipeLoadForwardQueryIO } // Load Pipeline Stage 0 @@ -99,7 +99,7 @@ class LoadUnit_S1(implicit p: Parameters) extends XSModule { val dcachePAddr = Output(UInt(PAddrBits.W)) val dcacheKill = Output(Bool()) val sbuffer = new LoadForwardQueryIO - val lsq = new MaskedLoadForwardQueryIO + val lsq = new PipeLoadForwardQueryIO }) val s1_uop = io.in.bits.uop @@ -156,7 +156,7 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper { val io = IO(new Bundle() { val in = Flipped(Decoupled(new LsPipelineBundle)) val out = Decoupled(new LsPipelineBundle) - val tlbFeedback = ValidIO(new TlbFeedback) + val rsFeedback = ValidIO(new RSFeedback) val dcacheResp = Flipped(DecoupledIO(new DCacheWordResp)) val lsq = new LoadForwardQueryIO val sbuffer = new LoadForwardQueryIO @@ -168,6 +168,7 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper { val s2_mask = io.in.bits.mask val s2_paddr = io.in.bits.paddr val s2_tlb_miss = io.in.bits.tlbMiss + val s2_data_invalid = io.lsq.dataInvalid val s2_exception = selectLoad(io.in.bits.uop.cf.exceptionVec, false).asUInt.orR val s2_mmio = io.in.bits.mmio && !s2_exception val s2_cache_miss = io.dcacheResp.bits.miss @@ -178,10 +179,18 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper { assert(!(io.in.valid && dcacheShouldResp && !io.dcacheResp.valid), "DCache response got lost") // feedback tlb result to RS - io.tlbFeedback.valid := io.in.valid - io.tlbFeedback.bits.hit := !s2_tlb_miss && (!s2_cache_replay || s2_mmio || s2_exception) - io.tlbFeedback.bits.rsIdx := io.in.bits.rsIdx - 
io.tlbFeedback.bits.flushState := io.in.bits.ptwBack + io.rsFeedback.valid := io.in.valid + io.rsFeedback.bits.hit := !s2_tlb_miss && (!s2_cache_replay || s2_mmio || s2_exception) && !s2_data_invalid + io.rsFeedback.bits.rsIdx := io.in.bits.rsIdx + io.rsFeedback.bits.flushState := io.in.bits.ptwBack + io.rsFeedback.bits.sourceType := Mux(s2_tlb_miss, RSFeedbackType.tlbMiss, + Mux(io.lsq.dataInvalid, + RSFeedbackType.dataInvalid, + RSFeedbackType.mshrFull + ) + ) + + // s2_cache_replay is quite slow to generate, send it separately to LQ io.needReplayFromRS := s2_cache_replay // merge forward result @@ -189,7 +198,7 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper { val forwardMask = Wire(Vec(8, Bool())) val forwardData = Wire(Vec(8, UInt(8.W))) - val fullForward = (~forwardMask.asUInt & s2_mask) === 0.U + val fullForward = (~forwardMask.asUInt & s2_mask) === 0.U && !io.lsq.dataInvalid io.lsq := DontCare io.sbuffer := DontCare @@ -221,7 +230,7 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper { )) val rdataPartialLoad = rdataHelper(s2_uop, rdataSel) - io.out.valid := io.in.valid && !s2_tlb_miss + io.out.valid := io.in.valid && !s2_tlb_miss && !s2_data_invalid // Inst will be canceled in store queue / lsq, // so we do not need to care about flush in load / store unit's out.valid io.out.bits := io.in.bits @@ -253,9 +262,9 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper { XSPerfAccumulate("dcache_miss", io.in.valid && s2_cache_miss) XSPerfAccumulate("full_forward", io.in.valid && fullForward) XSPerfAccumulate("dcache_miss_full_forward", io.in.valid && s2_cache_miss && fullForward) - XSPerfAccumulate("replay", io.tlbFeedback.valid && !io.tlbFeedback.bits.hit) - XSPerfAccumulate("replay_tlb_miss", io.tlbFeedback.valid && !io.tlbFeedback.bits.hit && s2_tlb_miss) - XSPerfAccumulate("replay_cache", io.tlbFeedback.valid && !io.tlbFeedback.bits.hit && !s2_tlb_miss && 
s2_cache_replay) + XSPerfAccumulate("replay", io.rsFeedback.valid && !io.rsFeedback.bits.hit) + XSPerfAccumulate("replay_tlb_miss", io.rsFeedback.valid && !io.rsFeedback.bits.hit && s2_tlb_miss) + XSPerfAccumulate("replay_cache", io.rsFeedback.valid && !io.rsFeedback.bits.hit && !s2_tlb_miss && s2_cache_replay) XSPerfAccumulate("stall_out", io.out.valid && !io.out.ready) } @@ -265,7 +274,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper { val ldout = Decoupled(new ExuOutput) val redirect = Flipped(ValidIO(new Redirect)) val flush = Input(Bool()) - val tlbFeedback = ValidIO(new TlbFeedback) + val rsFeedback = ValidIO(new RSFeedback) val rsIdx = Input(UInt(log2Up(IssQueSize).W)) val isFirstIssue = Input(Bool()) val dcache = new DCacheLoadIO @@ -298,11 +307,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper { load_s2.io.dcacheResp <> io.dcache.resp load_s2.io.lsq.forwardData <> io.lsq.forward.forwardData load_s2.io.lsq.forwardMask <> io.lsq.forward.forwardMask + load_s2.io.lsq.dataInvalid <> io.lsq.forward.dataInvalid load_s2.io.sbuffer.forwardData <> io.sbuffer.forwardData load_s2.io.sbuffer.forwardMask <> io.sbuffer.forwardMask + load_s2.io.sbuffer.dataInvalid <> io.sbuffer.dataInvalid // always false load_s2.io.dataForwarded <> io.lsq.loadDataForwarded - io.tlbFeedback.bits := RegNext(load_s2.io.tlbFeedback.bits) - io.tlbFeedback.valid := RegNext(load_s2.io.tlbFeedback.valid && !load_s2.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush)) + io.rsFeedback.bits := RegNext(load_s2.io.rsFeedback.bits) + io.rsFeedback.valid := RegNext(load_s2.io.rsFeedback.valid && !load_s2.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush)) io.lsq.needReplayFromRS := load_s2.io.needReplayFromRS // pre-calcuate sqIdx mask in s0, then send it to lsq in s1 for forwarding @@ -313,7 +324,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper { // load_s2.io.dcacheResp.bits.data := 
Mux1H(RegNext(io.dcache.s1_hit_way), RegNext(io.dcache.s1_data)) // assert(load_s2.io.dcacheResp.bits.data === io.dcache.resp.bits.data) - io.fastUop.valid := io.dcache.s1_hit_way.orR && !io.dcache.s1_disable_fast_wakeup && load_s1.io.in.valid && !load_s1.io.dcacheKill + io.fastUop.valid := io.dcache.s1_hit_way.orR && !io.dcache.s1_disable_fast_wakeup && load_s1.io.in.valid && + !load_s1.io.dcacheKill && !io.lsq.forward.dataInvalidFast io.fastUop.bits := load_s1.io.out.bits.uop XSDebug(load_s0.io.out.valid, diff --git a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala index 02411cde644ee5a1f056b76576761de4226d4e4c..78d9881899ec9fe5f89b1c5ba87df9103aaf65dc 100644 --- a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala @@ -39,7 +39,9 @@ class StoreUnit_S0(implicit p: Parameters) extends XSModule { io.out.bits := DontCare io.out.bits.vaddr := saddr - io.out.bits.data := genWdata(io.in.bits.src2, io.in.bits.uop.ctrl.fuOpType(1,0)) + // Now data use its own io + // io.out.bits.data := genWdata(io.in.bits.src2, io.in.bits.uop.ctrl.fuOpType(1,0)) + io.out.bits.data := io.in.bits.src2 // FIXME: remove data from pipeline io.out.bits.uop := io.in.bits.uop io.out.bits.miss := DontCare io.out.bits.rsIdx := io.rsIdx @@ -70,7 +72,7 @@ class StoreUnit_S1(implicit p: Parameters) extends XSModule { val out = Decoupled(new LsPipelineBundle) val lsq = ValidIO(new LsPipelineBundle) val dtlbResp = Flipped(DecoupledIO(new TlbResp)) - val tlbFeedback = ValidIO(new TlbFeedback) + val rsFeedback = ValidIO(new RSFeedback) }) val s1_paddr = io.dtlbResp.bits.paddr @@ -83,14 +85,15 @@ class StoreUnit_S1(implicit p: Parameters) extends XSModule { io.dtlbResp.ready := true.B // TODO: why dtlbResp needs a ready? 
// Send TLB feedback to store issue queue - io.tlbFeedback.valid := io.in.valid - io.tlbFeedback.bits.hit := !s1_tlb_miss - io.tlbFeedback.bits.flushState := io.dtlbResp.bits.ptwBack - io.tlbFeedback.bits.rsIdx := io.in.bits.rsIdx - XSDebug(io.tlbFeedback.valid, + io.rsFeedback.valid := io.in.valid + io.rsFeedback.bits.hit := !s1_tlb_miss + io.rsFeedback.bits.flushState := io.dtlbResp.bits.ptwBack + io.rsFeedback.bits.rsIdx := io.in.bits.rsIdx + io.rsFeedback.bits.sourceType := RSFeedbackType.tlbMiss + XSDebug(io.rsFeedback.valid, "S1 Store: tlbHit: %d roqIdx: %d\n", - io.tlbFeedback.bits.hit, - io.tlbFeedback.bits.rsIdx + io.rsFeedback.bits.hit, + io.rsFeedback.bits.rsIdx ) @@ -146,7 +149,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule { val stin = Flipped(Decoupled(new ExuInput)) val redirect = Flipped(ValidIO(new Redirect)) val flush = Input(Bool()) - val tlbFeedback = ValidIO(new TlbFeedback) + val rsFeedback = ValidIO(new RSFeedback) val dtlb = new TlbRequestIO() val rsIdx = Input(UInt(log2Up(IssQueSize).W)) val isFirstIssue = Input(Bool()) @@ -168,7 +171,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule { store_s1.io.lsq <> io.lsq // send result to sq store_s1.io.dtlbResp <> io.dtlb.resp - store_s1.io.tlbFeedback <> io.tlbFeedback + store_s1.io.rsFeedback <> io.rsFeedback PipelineConnect(store_s1.io.out, store_s2.io.in, true.B, store_s1.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush)) diff --git a/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala b/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala index bfc79363a995b9bc1051f77669d463ab2c46e8c3..a2ae039af7a8b808c5b7ae6efc3f4c0e0c4bc81e 100644 --- a/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala +++ b/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala @@ -387,6 +387,7 @@ class NewSbuffer(implicit p: Parameters) extends XSModule with HasSbufferConst { val selectedInflightMask = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, 
mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool())))) val selectedInflightData = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, data).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W))))) + forward.dataInvalid := false.B // data in store line merge buffer is always ready for (j <- 0 until DataBytes) { forward.forwardMask(j) := false.B forward.forwardData(j) := DontCare