未验证 提交 1b7adedc 编写于 作者: William Wang 提交者: GitHub

MemBlock: split store addr and store data (#781)

* RSFeedback: add source type

* StoreQueue: split store addr and store data

* StoreQueue: update ls forward logic

* Now it supports split addr and data

* Chore: force assign name for load/store unit

* RS: add rs'support for store a-d split

* StoreQueue: fix stlf logic

* StoreQueue: fix addr wb sq update logic

* AtomicsUnit: support split a/d

* StoreQueue: add sbuffer enq condition assertion

Store data op (std) may still be invalid after store addr op's (sta)
commitment, so datavalid needs to be checked before committing
store data to sbuffer

Note that at current commit a non-completed std op for a
committed store may exist. We should make sure that uop
will not be cancelled by a later branch mispredict. More work
to be done!

* Roq: add std/sta split writeback logic

Now store will commit only if both sta & std have been writebacked
Co-authored-by: ZhangZifei <zhangzifei20z@ict.ac.cn>
上级 68f25d38
......@@ -51,6 +51,14 @@ object ValidUndirectioned {
}
}
/** Encodes why an issued memory op reports back to its reservation station.
  * dataInvalid: forward addr matched but store data was not ready yet.
  */
object RSFeedbackType {
  def apply() = UInt(2.W)    // the feedback-type field is 2 bits wide
  val tlbMiss     = 0.U(2.W) // replay: TLB missed
  val mshrFull    = 1.U(2.W) // replay: MSHR full
  val dataInvalid = 2.U(2.W) // replay: store-to-load forward data not valid yet
}
class SCMeta(val useSC: Boolean)(implicit p: Parameters) extends XSBundle with HasSCParameter {
val tageTaken = if (useSC) Bool() else UInt(0.W)
val scUsed = if (useSC) Bool() else UInt(0.W)
......@@ -407,14 +415,13 @@ class RoqCommitIO(implicit p: Parameters) extends XSBundle {
def hasCommitInstr = !isWalk && valid.asUInt.orR
}
class TlbFeedback(implicit p: Parameters) extends XSBundle {
class RSFeedback(implicit p: Parameters) extends XSBundle {
val rsIdx = UInt(log2Up(IssQueSize).W)
val hit = Bool()
val flushState = Bool()
val sourceType = RSFeedbackType()
}
class RSFeedback(implicit p: Parameters) extends TlbFeedback
class FrontendToBackendIO(implicit p: Parameters) extends XSBundle {
// to backend end
val cfVec = Vec(DecodeWidth, DecoupledIO(new CtrlFlow))
......
......@@ -101,6 +101,9 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val storeUnits = Seq.fill(exuParameters.StuCnt)(Module(new StoreUnit))
val exeUnits = loadUnits ++ storeUnits
loadUnits.zipWithIndex.map(x => x._1.suggestName("LoadUnit_"+x._2))
storeUnits.zipWithIndex.map(x => x._1.suggestName("StoreUnit_"+x._2))
val atomicsUnit = Module(new AtomicsUnit)
val loadWritebackOverride = Mux(atomicsUnit.io.out.valid, atomicsUnit.io.out.bits, loadUnits.head.io.ldout.bits)
......@@ -221,7 +224,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
for (i <- 0 until exuParameters.LduCnt) {
loadUnits(i).io.redirect <> io.fromCtrlBlock.redirect
loadUnits(i).io.flush <> io.fromCtrlBlock.flush
loadUnits(i).io.tlbFeedback <> reservationStations(i).io.memfeedback
loadUnits(i).io.rsFeedback <> reservationStations(i).io.memfeedback
loadUnits(i).io.rsIdx := reservationStations(i).io.rsIdx // TODO: beautify it
loadUnits(i).io.isFirstIssue := reservationStations(i).io.isFirstIssue // NOTE: just for dtlb's perf cnt
loadUnits(i).io.dtlb <> dtlb.io.requestor(i)
......@@ -255,13 +258,16 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
stu.io.redirect <> io.fromCtrlBlock.redirect
stu.io.flush <> io.fromCtrlBlock.flush
stu.io.tlbFeedback <> rs.io.memfeedback
stu.io.rsFeedback <> rs.io.memfeedback
stu.io.rsIdx <> rs.io.rsIdx
stu.io.isFirstIssue <> rs.io.isFirstIssue // NOTE: just for dtlb's perf cnt
stu.io.dtlb <> dtlbReq
stu.io.stin <> rs.io.deq
stu.io.lsq <> lsq.io.storeIn(i)
// rs.io.storeData <> lsq.io.storeDataIn(i)
lsq.io.storeDataIn(i) := rs.io.stData
// sync issue info to rs
lsq.io.storeIssue(i).valid := rs.io.deq.valid
lsq.io.storeIssue(i).bits := rs.io.deq.bits
......@@ -321,6 +327,9 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val st0_atomics = reservationStations(atomic_rs0).io.deq.valid && FuType.storeIsAMO(reservationStations(atomic_rs0).io.deq.bits.uop.ctrl.fuType)
val st1_atomics = reservationStations(atomic_rs1).io.deq.valid && FuType.storeIsAMO(reservationStations(atomic_rs1).io.deq.bits.uop.ctrl.fuType)
val st0_data_atomics = reservationStations(atomic_rs0).io.stData.valid && FuType.storeIsAMO(reservationStations(atomic_rs0).io.stData.bits.uop.ctrl.fuType)
val st1_data_atomics = reservationStations(atomic_rs1).io.stData.valid && FuType.storeIsAMO(reservationStations(atomic_rs1).io.stData.bits.uop.ctrl.fuType)
when (st0_atomics) {
reservationStations(atomic_rs0).io.deq.ready := atomicsUnit.io.in.ready
storeUnits(0).io.stin.valid := false.B
......@@ -342,6 +351,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
atomicsUnit.io.in.valid := st0_atomics || st1_atomics
atomicsUnit.io.in.bits := Mux(st0_atomics, reservationStations(atomic_rs0).io.deq.bits, reservationStations(atomic_rs1).io.deq.bits)
atomicsUnit.io.storeDataIn.valid := st0_data_atomics || st1_data_atomics
atomicsUnit.io.storeDataIn.bits := Mux(st0_data_atomics, reservationStations(atomic_rs0).io.stData.bits, reservationStations(atomic_rs1).io.stData.bits)
atomicsUnit.io.rsIdx := Mux(st0_atomics, reservationStations(atomic_rs0).io.rsIdx, reservationStations(atomic_rs1).io.rsIdx)
atomicsUnit.io.redirect <> io.fromCtrlBlock.redirect
atomicsUnit.io.flush <> io.fromCtrlBlock.flush
......@@ -366,14 +377,14 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
}
when (state === s_atomics_0) {
atomicsUnit.io.tlbFeedback <> reservationStations(atomic_rs0).io.memfeedback
atomicsUnit.io.rsFeedback <> reservationStations(atomic_rs0).io.memfeedback
assert(!storeUnits(0).io.tlbFeedback.valid)
assert(!storeUnits(0).io.rsFeedback.valid)
}
when (state === s_atomics_1) {
atomicsUnit.io.tlbFeedback <> reservationStations(atomic_rs1).io.memfeedback
atomicsUnit.io.rsFeedback <> reservationStations(atomic_rs1).io.memfeedback
assert(!storeUnits(1).io.tlbFeedback.valid)
assert(!storeUnits(1).io.rsFeedback.valid)
}
lsq.io.exceptionAddr.lsIdx := io.lsqio.exceptionAddr.lsIdx
......
......@@ -8,7 +8,7 @@ import utils._
import xiangshan.backend.decode.{ImmUnion, Imm_U}
import xiangshan.backend.exu.{Exu, ExuConfig}
import xiangshan.backend.roq.RoqPtr
import xiangshan.mem.SqPtr
import xiangshan.mem.{SqPtr, StoreDataBundle}
import scala.math.max
......@@ -102,6 +102,7 @@ class ReservationStation
val numExist = Output(UInt(iqIdxWidth.W))
val fromDispatch = Flipped(DecoupledIO(new MicroOp))
val deq = DecoupledIO(new ExuInput)
val stData = if (exuCfg == StExeUnitCfg) ValidIO(new StoreDataBundle) else null
val srcRegValue = Input(Vec(srcNum, UInt(srcLen.W)))
val stIssuePtr = if (exuCfg == LdExeUnitCfg) Input(new SqPtr()) else null
......@@ -143,6 +144,11 @@ class ReservationStation
select.io.memfeedback := io.memfeedback
select.io.flushState := io.memfeedback.bits.flushState
}
if (exuCfg == StExeUnitCfg) {
select.io.dataReadyVec := ctrl.io.dataReadyVec
} else {
select.io.dataReadyVec := DontCare
}
ctrl.io.in.valid := select.io.enq.ready && io.fromDispatch.valid // NOTE: ctrl doesnt care redirect for timing optimization
ctrl.io.flush := io.flush
......@@ -162,6 +168,10 @@ class ReservationStation
if (exuCfg == LdExeUnitCfg) {
ctrl.io.stIssuePtr := RegNext(io.stIssuePtr)
}
if (exuCfg == StExeUnitCfg) {
ctrl.io.selData.valid := select.io.deqData.valid
ctrl.io.selData.bits := select.io.deqData.bits
}
data.io.in.valid := select.io.enq.fire()
data.io.in.addr := select.io.enq.bits
......@@ -174,6 +184,7 @@ class ReservationStation
}
if (exuCfg == StExeUnitCfg) {
data.io.fpRegValue := io.fpRegValue
data.io.selData := select.io.deqData.bits
}
data.io.sel := select.io.deq.bits
data.io.listen.wen := ctrl.io.listen
......@@ -196,6 +207,12 @@ class ReservationStation
if (srcNum > 1) { io.deq.bits.src2 := data.io.out(1) }
if (srcNum > 2) { io.deq.bits.src3 := data.io.out(2) }
if (exuCfg == JumpExeUnitCfg) { io.deq.bits.uop.cf.pc := data.io.pc }
if (exuCfg == StExeUnitCfg) {
io.stData.bits.uop := ctrl.io.stData.bits
io.stData.bits.data := data.io.stData
io.stData.valid := ctrl.io.stData.valid
}
}
class ReservationStationSelect
......@@ -225,6 +242,7 @@ class ReservationStationSelect
val redirectVec = Input(Vec(iqSize, Bool()))
val readyVec = Input(Vec(iqSize, Bool()))
val dataReadyVec = Input(Vec(iqSize, Bool())) // NOTE: wanna dead code elimination eliminates the codes
val validVec = Output(Vec(iqSize, Bool()))
val indexVec = Output(Vec(iqSize, UInt(iqIdxWidth.W)))
......@@ -236,6 +254,7 @@ class ReservationStationSelect
def fire() = valid && ready
}
val deq = DecoupledIO(UInt(iqIdxWidth.W))
val deqData = if (exuCfg == StExeUnitCfg) ValidIO(UInt(iqIdxWidth.W)) else null
val flushState = if (feedback) Input(Bool()) else null
val isFirstIssue = if (feedback) Output(Bool()) else null
......@@ -251,7 +270,8 @@ class ReservationStationSelect
* count queue : record replay cycle
*/
val s_idle :: s_valid :: s_wait :: s_replay :: Nil = Enum(4)
val s_idle :: s_valid :: s_wait :: s_replay :: s_sent :: Nil = Enum(5)
val d_idle :: d_sent :: Nil = Enum(2)
/* state machine
* s_idle : empty slot, init state, set when deq
* s_valid : ready to be secleted
......@@ -270,6 +290,11 @@ class ReservationStationSelect
val emptyIdxQueue = widthMap(i => emptyQueue(indexQueue(i)))
val countIdxQueue = widthMap(i => countQueue(indexQueue(i)))
// NOTE: wanna dead code elimination eliminates the below codes
val dataStateQueue = RegInit(VecInit(Seq.fill(iqSize)(d_idle)))
val dataValidQueue = VecInit(dataStateQueue.zip(stateQueue).map(a => a._1 === d_idle && a._2 =/= s_idle))
val dataReadyIdxQueue = widthMap(i => dataValidQueue(indexQueue(i)) && io.dataReadyVec(indexQueue(i)))
// select ready
// for no replay, select just equal to deq (attached)
// with replay, select is just two stage with deq.
......@@ -305,6 +330,19 @@ class ReservationStationSelect
(if(feedback) ~(0.U(iqSize.W)) else
Mux(RegNext(selectValid && (io.redirect.valid || io.flush)), 0.U, ~(0.U(iqSize.W))))
// store deq data, receiver(the sq) must be ready
// NOTE: wanna dead code elimination eliminates the below codes
val lastDataMask = Wire(UInt(iqSize.W))
val dataMask = WireInit(VecInit((0 until iqSize).map(i => dataReadyIdxQueue(i)))).asUInt & lastDataMask
val dataIdx = ParallelPriorityMux(dataMask.asBools zip indexQueue)
val dataPtr = ParallelPriorityMux(dataMask.asBools.zipWithIndex.map{ case (a,i) => (a, i.U)}) // NOTE: the idx of indexQueue
val haveData = Cat(dataMask).orR
val dataIdxReg = RegNext(dataIdx, init = 0.U)
val dataValid = haveData
val dataReg = RegNext(dataValid, init = false.B)
val dataPtrReg = RegNext(Mux(moveMask(dataPtr), dataPtr-1.U, dataPtr), init = 0.U)
lastDataMask := ~Mux(dataReg, UIntToOH(dataPtrReg), 0.U)
// deq
val dequeue = Mux(RegNext(io.flush), false.B,
if (feedback) bubbleReg else bubbleReg || issueFire)
......@@ -327,11 +365,28 @@ class ReservationStationSelect
if (feedback) {
when (io.memfeedback.valid) {
when (stateQueue(io.memfeedback.bits.rsIdx) === s_wait) {
stateQueue(io.memfeedback.bits.rsIdx) := Mux(io.memfeedback.bits.hit, s_idle, s_replay)
val s_finish_state = if (exuCfg == StExeUnitCfg) {
Mux(dataStateQueue(io.memfeedback.bits.rsIdx) === d_sent || (dataReg && dataIdxReg === io.memfeedback.bits.rsIdx),
s_idle, s_sent)
} else { s_idle }
stateQueue(io.memfeedback.bits.rsIdx) := Mux(io.memfeedback.bits.hit, s_finish_state, s_replay)
}
when (!io.memfeedback.bits.hit) {
countQueue(io.memfeedback.bits.rsIdx) := replayDelay(cntCountQueue(io.memfeedback.bits.rsIdx))
}
assert(stateQueue(io.memfeedback.bits.rsIdx) === s_wait, "mem feedback but rs dont wait for it")
}
}
if (exuCfg == StExeUnitCfg) {
when (dataReg) {
dataStateQueue(dataIdxReg) := d_sent
}
when (dataReg && stateQueue(dataIdxReg) === s_sent) {
stateQueue(dataIdxReg) := s_idle
}
for (i <- 0 until iqSize) {
assert(stateQueue(i) =/= s_sent || dataStateQueue(i) =/= d_sent, "dont want the state that addr and data both sent, but still not idle")
}
}
......@@ -383,6 +438,7 @@ class ReservationStationSelect
val enqIdx = indexQueue(enqPtr)
when (enqueue) {
stateQueue(enqIdx) := s_valid
dataStateQueue(enqIdx) := d_idle
cntCountQueue(enqIdx) := 0.U
}
......@@ -394,6 +450,11 @@ class ReservationStationSelect
io.deq.valid := selectValid
io.deq.bits := selectIndex
if (exuCfg == StExeUnitCfg) {
io.deqData.valid := dataValid
io.deqData.bits := dataIdx
}
io.numExist := RegNext(Mux(nextTailPtr.flag, if(isPow2(iqSize)) (iqSize-1).U else iqSize.U, nextTailPtr.value), init = (iqSize - 1).U)
assert(RegNext(Mux(tailPtr.flag, tailPtr.value===0.U, true.B)))
......@@ -463,10 +524,13 @@ class ReservationStationCtrl
val uop = new MicroOp
}))
val sel = Flipped(ValidIO(UInt(iqIdxWidth.W)))
val selData = if (exuCfg == StExeUnitCfg) Flipped(ValidIO(UInt(iqIdxWidth.W))) else null
val out = ValidIO(new MicroOp)
val stData = if (exuCfg == StExeUnitCfg) ValidIO(new MicroOp) else null
val redirectVec = Output(Vec(iqSize, Bool()))
val readyVec = Output(Vec(iqSize, Bool()))
val dataReadyVec = if (exuCfg == StExeUnitCfg) Output(Vec(IssQueSize, Bool())) else null
val validVec = Input(Vec(iqSize, Bool()))
val indexVec = Input(Vec(iqSize, UInt(iqIdxWidth.W)))
......@@ -486,7 +550,6 @@ class ReservationStationCtrl
val enqEn = io.in.valid
val enqEnReg = RegNext(enqEn && !(io.redirect.valid || io.flush), init = false.B)
val enqUop = io.in.bits.uop
val enqUopReg = RegEnable(enqUop, selValid)
val selPtr = io.sel.bits
val selPtrReg = RegEnable(selPtr, selValid)
val data = io.listen
......@@ -547,7 +610,12 @@ class ReservationStationCtrl
}
// load wait store
io.readyVec := srcQueueWire.map(Cat(_).andR)
if (exuCfg == StExeUnitCfg) {
io.readyVec := srcQueueWire.map(a => a(0))
io.dataReadyVec := srcQueueWire.map(a => a(1))
} else {
io.readyVec := srcQueueWire.map(Cat(_).andR)
}
if (exuCfg == LdExeUnitCfg) {
val ldWait = Reg(Vec(iqSize, Bool()))
val sqIdx = Reg(Vec(iqSize, new SqPtr()))
......@@ -566,7 +634,7 @@ class ReservationStationCtrl
}
val redirectHit = io.redirectVec(selPtr)
val uop = Module(new SyncDataModuleTemplate(new MicroOp, iqSize, 1, 1))
val uop = Module(new SyncDataModuleTemplate(new MicroOp, iqSize, if (exuCfg == StExeUnitCfg) 2 else 1, 1))
uop.io.raddr(0) := selPtr
io.out.valid := RegNext(selValid && ~redirectHit)
......@@ -575,7 +643,14 @@ class ReservationStationCtrl
uop.io.waddr(0) := enqPtr
uop.io.wdata(0) := enqUop
class fastSendUop extends Bundle {
if (exuCfg == StExeUnitCfg) { // NOTE: send data part of st
uop.io.raddr(1) := io.selData.bits
io.stData.bits := uop.io.rdata(1)
io.stData.valid := RegNext(io.selData.valid && ~io.redirectVec(io.selData.bits))
}
// NOTE: st dont fast wake others, dont care override
class fastSendUop extends XSBundle {
val pdest = UInt(PhyRegIdxWidth.W)
val rfWen = Bool()
val fpWen = Bool()
......@@ -595,6 +670,9 @@ class ReservationStationCtrl
red := roq.needFlush(io.redirect, io.flush)
}
io.out.bits.roqIdx := roqIdx(selPtrReg)
if (exuCfg == StExeUnitCfg) {
io.stData.bits.roqIdx := roqIdx(RegEnable(io.selData.bits, io.selData.valid))
}
io.fastUopOut := DontCare
if (fastWakeup) {
......@@ -790,7 +868,10 @@ class ReservationStationData
}
val sel = Input(UInt(iqIdxWidth.W))
val selData = if(exuCfg == StExeUnitCfg) Input(UInt(iqIdxWidth.W)) else null
val out = Output(Vec(srcNum, UInt(srcLen.W)))
val stData = if(exuCfg == StExeUnitCfg) Output(UInt(srcLen.W)) else null
val pc = if(exuCfg == JumpExeUnitCfg) Output(UInt(VAddrBits.W)) else null
})
......@@ -870,8 +951,18 @@ class ReservationStationData
(0 until srcNum).foreach(i => data(i).w(0).wdata := io.srcRegValue(i) )
}
// deq
data.map(_.r.addr := io.sel)
if (exuCfg == StExeUnitCfg) {
data(0).r.addr := io.sel
data(1).r.addr := io.selData
io.stData := data(1).r.rdata
} else {
data.map(_.r.addr := io.sel)
}
io.out := data.map(_.r.rdata)
if (exuCfg == StExeUnitCfg) {
io.out(1) := DontCare
}
if(pcMem.nonEmpty){
pcMem.get.io.raddr(0) := io.sel
io.pc := pcMem.get.io.rdata(0)
......
......@@ -48,6 +48,7 @@ class RoqLsqIO(implicit p: Parameters) extends XSBundle {
val pendingld = Output(Bool())
val pendingst = Output(Bool())
val commit = Output(Bool())
val storeDataRoqWb = Input(Vec(StorePipelineWidth, Valid(new RoqPtr)))
}
class RoqEnqIO(implicit p: Parameters) extends XSBundle {
......@@ -266,6 +267,7 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc
// writeback status
// val writebacked = Reg(Vec(RoqSize, Bool()))
val writebacked = Mem(RoqSize, Bool())
val store_data_writebacked = Mem(RoqSize, Bool())
// data for redirect, exception, etc.
// val flagBkup = RegInit(VecInit(List.fill(RoqSize)(false.B)))
val flagBkup = Mem(RoqSize, Bool())
......@@ -460,7 +462,8 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc
io.commits.isWalk := state =/= s_idle
val commit_v = Mux(state === s_idle, VecInit(deqPtrVec.map(ptr => valid(ptr.value))), VecInit(walkPtrVec.map(ptr => valid(ptr.value))))
val commit_w = VecInit(deqPtrVec.map(ptr => writebacked(ptr.value)))
// store will be commited iff both sta & std have been writebacked
val commit_w = VecInit(deqPtrVec.map(ptr => writebacked(ptr.value) && store_data_writebacked(ptr.value)))
val commit_exception = exceptionDataRead.valid && !isAfter(exceptionDataRead.bits.roqIdx, deqPtrVec.last)
val commit_block = VecInit((0 until CommitWidth).map(i => !commit_w(i)))
val allowOnlyOneCommit = commit_exception || intrBitSetReg
......@@ -655,11 +658,14 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc
for (i <- 0 until RenameWidth) {
when (canEnqueue(i)) {
writebacked(enqPtrVec(i).value) := false.B
val isStu = io.enq.req(i).bits.ctrl.fuType === FuType.stu
store_data_writebacked(enqPtrVec(i).value) := !isStu
}
}
when (exceptionGen.io.out.valid) {
val wbIdx = exceptionGen.io.out.bits.roqIdx.value
writebacked(wbIdx) := true.B
store_data_writebacked(wbIdx) := true.B
}
// writeback logic set numWbPorts writebacked to true
for (i <- 0 until numWbPorts) {
......@@ -669,6 +675,12 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc
writebacked(wbIdx) := !block_wb
}
}
// Mark a store's data part (std) as written back when the store queue
// reports its data has arrived; a store only commits after both its
// sta and std writebacks have been observed.
io.lsq.storeDataRoqWb.foreach { wb =>
  when (wb.valid) {
    store_data_writebacked(wb.bits.value) := true.B
  }
}
// flagBkup
// enqueue logic set 6 flagBkup at most
......
......@@ -50,6 +50,11 @@ class LsPipelineBundle(implicit p: Parameters) extends XSBundle {
val forwardData = Vec(8, UInt(8.W))
}
// Carries a store's data part (split from its address part) from the
// reservation station to the store queue / atomics unit.
class StoreDataBundle(implicit p: Parameters) extends XSBundle {
  val data = UInt((XLEN+1).W) // store data; XLEN+1 bits — extra bit presumably for fp tagging, TODO confirm
  val uop = new MicroOp       // store's uop; sqIdx locates the sq entry, roqIdx is reported back to roq
}
class LoadForwardQueryIO(implicit p: Parameters) extends XSBundle {
val paddr = Output(UInt(PAddrBits.W))
val mask = Output(UInt(8.W))
......@@ -62,9 +67,17 @@ class LoadForwardQueryIO(implicit p: Parameters) extends XSBundle {
// val lqIdx = Output(UInt(LoadQueueIdxWidth.W))
val sqIdx = Output(new SqPtr)
val dataInvalid = Input(Bool()) // Addr match, but data is not valid for now
// If dataInvalid, load inst should sleep for a while
// Feedback type should be RSFeedbackType.dataInvalid
}
class MaskedLoadForwardQueryIO(implicit p: Parameters) extends XSBundle {
// LoadForwardQueryIO used in load pipeline
//
// Difference between PipeLoadForwardQueryIO and LoadForwardQueryIO:
// PipeIO use predecoded sqIdxMask for better forward timing
class PipeLoadForwardQueryIO(implicit p: Parameters) extends XSBundle {
val paddr = Output(UInt(PAddrBits.W))
val mask = Output(UInt(8.W))
val uop = Output(new MicroOp) // for replay
......@@ -74,7 +87,13 @@ class MaskedLoadForwardQueryIO(implicit p: Parameters) extends XSBundle {
val forwardMask = Input(Vec(8, Bool()))
val forwardData = Input(Vec(8, UInt(8.W)))
val sqIdx = Output(new SqPtr) // for debug
val sqIdx = Output(new SqPtr) // for debug, should not be used in pipeline for timing reasons
// sqIdxMask is calcuated in earlier stage for better timing
val sqIdxMask = Output(UInt(StoreQueueSize.W))
// dataInvalid: addr match, but data is not valid for now
val dataInvalidFast = Input(Bool()) // resp to load_s1
val dataInvalid = Input(Bool()) // resp to load_s2
// If dataInvalid, load inst should sleep for a while
// Feedback type should be RSFeedbackType.dataInvalid
}
......@@ -42,12 +42,13 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParamet
val flush = Input(Bool())
val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LsPipelineBundle)))
val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle)))
val storeDataIn = Vec(StorePipelineWidth, Flipped(Valid(new StoreDataBundle))) // store data, send to sq from rs
val loadDataForwarded = Vec(LoadPipelineWidth, Input(Bool()))
val needReplayFromRS = Vec(LoadPipelineWidth, Input(Bool()))
val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq))
val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback int load
val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store
val forward = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO))
val forward = Vec(LoadPipelineWidth, Flipped(new PipeLoadForwardQueryIO))
val roq = Flipped(new RoqLsqIO)
val rollback = Output(Valid(new Redirect))
val dcache = Flipped(ValidIO(new Refill))
......@@ -101,6 +102,7 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParamet
storeQueue.io.brqRedirect <> io.brqRedirect
storeQueue.io.flush <> io.flush
storeQueue.io.storeIn <> io.storeIn
storeQueue.io.storeDataIn <> io.storeDataIn
storeQueue.io.sbuffer <> io.sbuffer
storeQueue.io.mmioStout <> io.mmioStout
storeQueue.io.roq <> io.roq
......
......@@ -75,7 +75,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
val loadDataForwarded = Vec(LoadPipelineWidth, Input(Bool()))
val needReplayFromRS = Vec(LoadPipelineWidth, Input(Bool()))
val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback int load
val load_s1 = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO))
val load_s1 = Vec(LoadPipelineWidth, Flipped(new PipeLoadForwardQueryIO))
val roq = Flipped(new RoqLsqIO)
val rollback = Output(Valid(new Redirect)) // replay now starts from load instead of store
val dcache = Flipped(ValidIO(new Refill))
......@@ -644,6 +644,11 @@ class LoadQueue(implicit p: Parameters) extends XSModule
allowEnqueue := validCount + enqNumber <= (LoadQueueSize - RenameWidth).U
/**
* misc
*/
io.roq.storeDataRoqWb := DontCare // will be overwriten by store queue's result
// perf counter
QueuePerf(LoadQueueSize, validCount, !allowEnqueue)
io.lqFull := !allowEnqueue
......
......@@ -7,7 +7,7 @@ import utils._
import xiangshan._
import xiangshan.cache._
import xiangshan.cache.{DCacheWordIO, DCacheLineIO, TlbRequestIO, MemoryOpConstants}
import xiangshan.backend.roq.RoqLsqIO
import xiangshan.backend.roq.{RoqLsqIO, RoqPtr}
import difftest._
class SqPtr(implicit p: Parameters) extends CircularQueuePtr[SqPtr](
......@@ -39,24 +39,25 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
val enq = new SqEnqIO
val brqRedirect = Flipped(ValidIO(new Redirect))
val flush = Input(Bool())
val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle)))
val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq))
val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle))) // store addr, data is not included
val storeDataIn = Vec(StorePipelineWidth, Flipped(Valid(new StoreDataBundle))) // store data, send to sq from rs
val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq)) // write commited store to sbuffer
val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store
val forward = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO))
val forward = Vec(LoadPipelineWidth, Flipped(new PipeLoadForwardQueryIO))
val roq = Flipped(new RoqLsqIO)
val uncache = new DCacheWordIO
// val refill = Flipped(Valid(new DCacheLineReq ))
val exceptionAddr = new ExceptionAddrIO
val sqempty = Output(Bool())
val issuePtrExt = Output(new SqPtr)
val storeIssue = Vec(StorePipelineWidth, Flipped(Valid(new ExuInput)))
val issuePtrExt = Output(new SqPtr) // used to wake up delayed load/store
val storeIssue = Vec(StorePipelineWidth, Flipped(Valid(new ExuInput))) // used to update issuePtrExt
val sqFull = Output(Bool())
})
// data modules
val uop = Reg(Vec(StoreQueueSize, new MicroOp))
// val data = Reg(Vec(StoreQueueSize, new LsqEntry))
val dataModule = Module(new StoreQueueData(StoreQueueSize, numRead = StorePipelineWidth, numWrite = StorePipelineWidth, numForward = StorePipelineWidth))
val dataModule = Module(new SQDataModule(StoreQueueSize, numRead = StorePipelineWidth, numWrite = StorePipelineWidth, numForward = StorePipelineWidth))
dataModule.io := DontCare
val paddrModule = Module(new SQPaddrModule(StoreQueueSize, numRead = StorePipelineWidth, numWrite = StorePipelineWidth, numForward = StorePipelineWidth))
paddrModule.io := DontCare
......@@ -65,8 +66,9 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
// state & misc
val allocated = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // sq entry has been allocated
val addrvalid = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // non-mmio addr is valid
val datavalid = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // non-mmio data is valid
val writebacked = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // inst has been writebacked to CDB
val allvalid = VecInit((0 until StoreQueueSize).map(i => addrvalid(i) && datavalid(i))) // non-mmio data & addr is valid
val issued = Reg(Vec(StoreQueueSize, Bool())) // inst has been issued by rs
val commited = Reg(Vec(StoreQueueSize, Bool())) // inst has been commited by roq
val pending = Reg(Vec(StoreQueueSize, Bool())) // mmio pending: inst is an mmio inst, it will not be executed until it reachs the end of roq
......@@ -123,7 +125,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
uop(index) := io.enq.req(i).bits
allocated(index) := true.B
datavalid(index) := false.B
writebacked(index) := false.B
addrvalid(index) := false.B
issued(index) := false.B
commited(index) := false.B
pending(index) := false.B
......@@ -168,7 +170,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
*
* Most store instructions writeback to regfile in the previous cycle.
* However,
* (1) For an mmio instruction with exceptions, we need to mark it as datavalid
* (1) For an mmio instruction with exceptions, we need to mark it as addrvalid
* (in this way it will trigger an exception when it reaches ROB's head)
* instead of pending to avoid sending them to lower level.
* (2) For an mmio instruction without exceptions, we mark it as pending.
......@@ -176,39 +178,33 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
* Upon receiving the response, StoreQueue writes back the instruction
* through arbiter with store units. It will later commit as normal.
*/
// Write addr to sq
for (i <- 0 until StorePipelineWidth) {
dataModule.io.wen(i) := false.B
paddrModule.io.wen(i) := false.B
dataModule.io.mask.wen(i) := false.B
val stWbIndex = io.storeIn(i).bits.uop.sqIdx.value
when (io.storeIn(i).fire()) {
datavalid(stWbIndex) := !io.storeIn(i).bits.mmio
writebacked(stWbIndex) := !io.storeIn(i).bits.mmio
addrvalid(stWbIndex) := true.B//!io.storeIn(i).bits.mmio
pending(stWbIndex) := io.storeIn(i).bits.mmio
val storeWbData = Wire(new SQDataEntry)
storeWbData := DontCare
storeWbData.mask := io.storeIn(i).bits.mask
storeWbData.data := io.storeIn(i).bits.data
dataModule.io.waddr(i) := stWbIndex
dataModule.io.wdata(i) := storeWbData
dataModule.io.wen(i) := true.B
dataModule.io.mask.waddr(i) := stWbIndex
dataModule.io.mask.wdata(i) := io.storeIn(i).bits.mask
dataModule.io.mask.wen(i) := true.B
paddrModule.io.waddr(i) := stWbIndex
paddrModule.io.wdata(i) := io.storeIn(i).bits.paddr
paddrModule.io.wen(i) := true.B
mmio(stWbIndex) := io.storeIn(i).bits.mmio
XSInfo("store write to sq idx %d pc 0x%x vaddr %x paddr %x data %x mmio %x\n",
XSInfo("store addr write to sq idx %d pc 0x%x vaddr %x paddr %x mmio %x\n",
io.storeIn(i).bits.uop.sqIdx.value,
io.storeIn(i).bits.uop.cf.pc,
io.storeIn(i).bits.vaddr,
io.storeIn(i).bits.paddr,
io.storeIn(i).bits.data,
io.storeIn(i).bits.mmio
)
)
}
// vaddrModule write is delayed, as vaddrModule will not be read right after write
vaddrModule.io.waddr(i) := RegNext(stWbIndex)
......@@ -216,6 +212,31 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
vaddrModule.io.wen(i) := RegNext(io.storeIn(i).fire())
}
// Write data to sq: store data arrives from the RS (std op), independently
// of the address pipeline. Marks the entry's data valid and notifies the roq
// so the store can become eligible to commit.
for (i <- 0 until StorePipelineWidth) {
  // defaults: no data-array write and no roq notification this cycle
  dataModule.io.data.wen(i) := false.B
  io.roq.storeDataRoqWb(i).valid := false.B
  io.roq.storeDataRoqWb(i).bits := DontCare
  val stWbIndex = io.storeDataIn(i).bits.uop.sqIdx.value
  when (io.storeDataIn(i).fire()) {
    // mark this sq entry's data as valid (addr validity is tracked separately)
    datavalid(stWbIndex) := true.B
    // write the data, expanded/aligned per access size (fuOpType(1,0))
    dataModule.io.data.waddr(i) := stWbIndex
    dataModule.io.data.wdata(i) := genWdata(io.storeDataIn(i).bits.data, io.storeDataIn(i).bits.uop.ctrl.fuOpType(1,0))
    dataModule.io.data.wen(i) := true.B
    // tell roq this store's std has been written back
    io.roq.storeDataRoqWb(i).valid := true.B
    io.roq.storeDataRoqWb(i).bits := io.storeDataIn(i).bits.uop.roqIdx
    XSInfo("store data write to sq idx %d pc 0x%x data %x -> %x\n",
      io.storeDataIn(i).bits.uop.sqIdx.value,
      io.storeDataIn(i).bits.uop.cf.pc,
      io.storeDataIn(i).bits.data,
      dataModule.io.data.wdata(i)
    )
  }
}
/**
* load forward query
*
......@@ -235,25 +256,33 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
// i.e. forward1 is the target entries with the same flag bits and forward2 otherwise
val differentFlag = deqPtrExt(0).flag =/= io.forward(i).sqIdx.flag
val forwardMask = io.forward(i).sqIdxMask
val storeWritebackedVec = WireInit(VecInit(Seq.fill(StoreQueueSize)(false.B)))
for (j <- 0 until StoreQueueSize) {
storeWritebackedVec(j) := datavalid(j) && allocated(j) // all datavalid terms need to be checked
}
val needForward1 = Mux(differentFlag, ~deqMask, deqMask ^ forwardMask) & storeWritebackedVec.asUInt
val needForward2 = Mux(differentFlag, forwardMask, 0.U(StoreQueueSize.W)) & storeWritebackedVec.asUInt
XSDebug(p"$i f1 ${Binary(needForward1)} f2 ${Binary(needForward2)} " +
// all addrvalid terms need to be checked
val addrValidVec = WireInit(VecInit((0 until StoreQueueSize).map(i => addrvalid(i) && allocated(i))))
val dataValidVec = WireInit(VecInit((0 until StoreQueueSize).map(i => datavalid(i))))
val allValidVec = WireInit(VecInit((0 until StoreQueueSize).map(i => addrvalid(i) && datavalid(i) && allocated(i))))
val canForward1 = Mux(differentFlag, ~deqMask, deqMask ^ forwardMask) & allValidVec.asUInt
val canForward2 = Mux(differentFlag, forwardMask, 0.U(StoreQueueSize.W)) & allValidVec.asUInt
val needForward = Mux(differentFlag, ~deqMask | forwardMask, deqMask ^ forwardMask)
XSDebug(p"$i f1 ${Binary(canForward1)} f2 ${Binary(canForward2)} " +
p"sqIdx ${io.forward(i).sqIdx} pa ${Hexadecimal(io.forward(i).paddr)}\n"
)
// do real fwd query
dataModule.io.needForward(i)(0) := needForward1 & paddrModule.io.forwardMmask(i).asUInt
dataModule.io.needForward(i)(1) := needForward2 & paddrModule.io.forwardMmask(i).asUInt
// do real fwd query (cam lookup in load_s1)
dataModule.io.needForward(i)(0) := canForward1 & paddrModule.io.forwardMmask(i).asUInt
dataModule.io.needForward(i)(1) := canForward2 & paddrModule.io.forwardMmask(i).asUInt
paddrModule.io.forwardMdata(i) := io.forward(i).paddr
// Forward result will be generated 1 cycle later (load_s2)
io.forward(i).forwardMask := dataModule.io.forwardMask(i)
io.forward(i).forwardData := dataModule.io.forwardData(i)
// If addr match, data not ready, mark it as dataInvalid
// load_s1: generate dataInvalid in load_s1 to set fastUop to
io.forward(i).dataInvalidFast := (addrValidVec.asUInt & ~dataValidVec.asUInt & paddrModule.io.forwardMmask(i).asUInt & needForward).orR
// load_s2
io.forward(i).dataInvalid := RegNext(io.forward(i).dataInvalidFast)
}
/**
......@@ -262,7 +291,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
* States:
* (1) writeback from store units: mark as pending
* (2) when they reach ROB's head, they can be sent to uncache channel
* (3) response from uncache channel: mark as datavalid
 * (3) response from uncache channel: mark as datavalid
* (4) writeback to ROB (and other units): mark as writebacked
* (5) ROB commits the instruction: same as normal instructions
*/
......@@ -271,7 +300,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
val uncacheState = RegInit(s_idle)
switch(uncacheState) {
is(s_idle) {
when(io.roq.pendingst && pending(deqPtr) && allocated(deqPtr)) {
when(io.roq.pendingst && pending(deqPtr) && allocated(deqPtr) && datavalid(deqPtr) && addrvalid(deqPtr)) {
uncacheState := s_req
}
}
......@@ -306,6 +335,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
io.uncache.req.bits.id := DontCare
when(io.uncache.req.fire()){
// mmio store should not be committed until uncache req is sent
pending(deqPtr) := false.B
XSDebug(
......@@ -319,12 +349,9 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
// (3) response from uncache channel: mark as datavalid
io.uncache.resp.ready := true.B
when (io.uncache.resp.fire()) {
datavalid(deqPtr) := true.B
}
// (4) writeback to ROB (and other units): mark as writebacked
io.mmioStout.valid := uncacheState === s_wb // allocated(deqPtr) && datavalid(deqPtr) && !writebacked(deqPtr)
io.mmioStout.valid := uncacheState === s_wb
io.mmioStout.bits.uop := uop(deqPtr)
io.mmioStout.bits.uop.sqIdx := deqPtrExt(0)
io.mmioStout.bits.data := dataModule.io.rdata(0).data // dataModule.io.rdata.read(deqPtr)
......@@ -335,7 +362,6 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
io.mmioStout.bits.debug.isPerfCnt := false.B
io.mmioStout.bits.fflags := DontCare
when (io.mmioStout.fire()) {
writebacked(deqPtr) := true.B
allocated(deqPtr) := false.B
}
......@@ -360,6 +386,8 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
// if !sbuffer.fire(), read the same ptr
// if sbuffer.fire(), read next
io.sbuffer(i).valid := allocated(ptr) && commited(ptr) && !mmio(ptr)
// Note that store data/addr should both be valid after store's commit
assert(!io.sbuffer(i).valid || allvalid(ptr))
io.sbuffer(i).bits.cmd := MemoryOpConstants.M_XWR
io.sbuffer(i).bits.addr := paddrModule.io.rdata(i)
io.sbuffer(i).bits.data := dataModule.io.rdata(i).data
......@@ -460,10 +488,11 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
if (i % 4 == 0) XSDebug("")
XSDebug(false, true.B, "%x ", uop(i).cf.pc)
PrintFlag(allocated(i), "a")
PrintFlag(allocated(i) && datavalid(i), "v")
PrintFlag(allocated(i) && writebacked(i), "w")
PrintFlag(allocated(i) && addrvalid(i), "a")
PrintFlag(allocated(i) && datavalid(i), "d")
PrintFlag(allocated(i) && commited(i), "c")
PrintFlag(allocated(i) && pending(i), "p")
PrintFlag(allocated(i) && mmio(i), "m")
XSDebug(false, true.B, " ")
if (i % 4 == 3 || i == StoreQueueSize - 1) XSDebug(false, true.B, "\n")
}
......
......@@ -63,9 +63,16 @@ class SQData8Module(size: Int, numRead: Int, numWrite: Int, numForward: Int)(imp
val io = IO(new Bundle() {
val raddr = Vec(numRead, Input(UInt(log2Up(size).W)))
val rdata = Vec(numRead, Output(new SQData8Entry))
val wen = Vec(numWrite, Input(Bool()))
val waddr = Vec(numWrite, Input(UInt(log2Up(size).W)))
val wdata = Vec(numWrite, Input(new SQData8Entry))
val data = new Bundle() {
val wen = Vec(numWrite, Input(Bool()))
val waddr = Vec(numWrite, Input(UInt(log2Up(size).W)))
val wdata = Vec(numWrite, Input(UInt((XLEN/8).W)))
}
val mask = new Bundle() {
val wen = Vec(numWrite, Input(Bool()))
val waddr = Vec(numWrite, Input(UInt(log2Up(size).W)))
val wdata = Vec(numWrite, Input(Bool()))
}
val needForward = Input(Vec(numForward, Vec(2, UInt(size.W))))
val forwardValid = Vec(numForward, Output(Bool()))
......@@ -76,10 +83,15 @@ class SQData8Module(size: Int, numRead: Int, numWrite: Int, numForward: Int)(imp
val data = Reg(Vec(size, new SQData8Entry))
// writeback to lq/sq
// writeback to sq
(0 until numWrite).map(i => {
when(io.wen(i)){
data(io.waddr(i)) := io.wdata(i)
when(io.data.wen(i)){
data(io.data.waddr(i)).data := io.data.wdata(i)
}
})
(0 until numWrite).map(i => {
when(io.mask.wen(i)){
data(io.mask.waddr(i)).valid := io.mask.wdata(i)
}
})
......@@ -91,7 +103,12 @@ class SQData8Module(size: Int, numRead: Int, numWrite: Int, numForward: Int)(imp
// DataModuleTemplate should not be used when there're any write conflicts
for (i <- 0 until numWrite) {
for (j <- i+1 until numWrite) {
assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j)))
assert(!(io.data.wen(i) && io.data.wen(j) && io.data.waddr(i) === io.data.waddr(j)))
}
}
for (i <- 0 until numWrite) {
for (j <- i+1 until numWrite) {
assert(!(io.mask.wen(i) && io.mask.wen(j) && io.mask.waddr(i) === io.mask.waddr(j)))
}
}
......@@ -150,13 +167,20 @@ class SQDataEntry(implicit p: Parameters) extends XSBundle {
val data = UInt(XLEN.W)
}
class StoreQueueData(size: Int, numRead: Int, numWrite: Int, numForward: Int)(implicit p: Parameters) extends XSModule with HasDCacheParameters with HasCircularQueuePtrHelper {
class SQDataModule(size: Int, numRead: Int, numWrite: Int, numForward: Int)(implicit p: Parameters) extends XSModule with HasDCacheParameters with HasCircularQueuePtrHelper {
val io = IO(new Bundle() {
val raddr = Vec(numRead, Input(UInt(log2Up(size).W)))
val rdata = Vec(numRead, Output(new SQDataEntry))
val wen = Vec(numWrite, Input(Bool()))
val waddr = Vec(numWrite, Input(UInt(log2Up(size).W)))
val wdata = Vec(numWrite, Input(new SQDataEntry))
val data = new Bundle() {
val wen = Vec(numWrite, Input(Bool()))
val waddr = Vec(numWrite, Input(UInt(log2Up(size).W)))
val wdata = Vec(numWrite, Input(UInt(XLEN.W)))
}
val mask = new Bundle() {
val wen = Vec(numWrite, Input(Bool()))
val waddr = Vec(numWrite, Input(UInt(log2Up(size).W)))
val wdata = Vec(numWrite, Input(UInt(8.W)))
}
val needForward = Input(Vec(numForward, Vec(2, UInt(size.W))))
val forwardMask = Vec(numForward, Output(Vec(8, Bool())))
......@@ -169,10 +193,12 @@ class StoreQueueData(size: Int, numRead: Int, numWrite: Int, numForward: Int)(im
for (i <- 0 until numWrite) {
// write to data8
for (j <- 0 until 8) {
data8(j).io.waddr(i) := io.waddr(i)
data8(j).io.wdata(i).valid := io.wdata(i).mask(j)
data8(j).io.wdata(i).data := io.wdata(i).data(8*(j+1)-1, 8*j)
data8(j).io.wen(i) := io.wen(i)
data8(j).io.mask.waddr(i) := io.mask.waddr(i)
data8(j).io.mask.wdata(i) := io.mask.wdata(i)(j)
data8(j).io.mask.wen(i) := io.mask.wen(i)
data8(j).io.data.waddr(i) := io.data.waddr(i)
data8(j).io.data.wdata(i) := io.data.wdata(i)(8*(j+1)-1, 8*j)
data8(j).io.data.wen(i) := io.data.wen(i)
}
}
......@@ -188,7 +214,12 @@ class StoreQueueData(size: Int, numRead: Int, numWrite: Int, numForward: Int)(im
// DataModuleTemplate should not be used when there're any write conflicts
for (i <- 0 until numWrite) {
for (j <- i+1 until numWrite) {
assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j)))
assert(!(io.data.wen(i) && io.data.wen(j) && io.data.waddr(i) === io.data.waddr(j)))
}
}
for (i <- 0 until numWrite) {
for (j <- i+1 until numWrite) {
assert(!(io.mask.wen(i) && io.mask.wen(j) && io.mask.waddr(i) === io.mask.waddr(j)))
}
}
......
......@@ -11,12 +11,13 @@ import difftest._
class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstants{
val io = IO(new Bundle() {
val in = Flipped(Decoupled(new ExuInput))
val storeDataIn = Flipped(Valid(new StoreDataBundle)) // src2 from rs
val out = Decoupled(new ExuOutput)
val dcache = new DCacheWordIO
val dtlb = new TlbRequestIO
val rsIdx = Input(UInt(log2Up(IssQueSize).W))
val flush_sbuffer = new SbufferFlushBundle
val tlbFeedback = ValidIO(new TlbFeedback)
val rsFeedback = ValidIO(new RSFeedback)
val redirect = Flipped(ValidIO(new Redirect))
val flush = Input(Bool())
val exceptionAddr = ValidIO(UInt(VAddrBits.W))
......@@ -27,6 +28,8 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstant
//-------------------------------------------------------
val s_invalid :: s_tlb :: s_flush_sbuffer_req :: s_flush_sbuffer_resp :: s_cache_req :: s_cache_resp :: s_finish :: Nil = Enum(7)
val state = RegInit(s_invalid)
val addr_valid = RegInit(false.B)
val data_valid = RegInit(false.B)
val in = Reg(new ExuInput())
val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec()))
val atom_override_xtval = RegInit(false.B)
......@@ -68,18 +71,30 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstant
io.in.ready := true.B
when (io.in.fire()) {
in := io.in.bits
in.src2 := in.src2 // leave src2 unchanged
addr_valid := true.B
}
when (io.storeDataIn.fire()) {
in.src2 := io.storeDataIn.bits.data
data_valid := true.B
}
when(data_valid && addr_valid) {
state := s_tlb
addr_valid := false.B
data_valid := false.B
}
}
// Send TLB feedback to store issue queue
// we send feedback right after we receives request
// also, we always treat amo as tlb hit
// since we will continue polling tlb all by ourself
io.tlbFeedback.valid := RegNext(RegNext(io.in.valid))
io.tlbFeedback.bits.hit := true.B
io.tlbFeedback.bits.rsIdx := RegEnable(io.rsIdx, io.in.valid)
io.tlbFeedback.bits.flushState := DontCare
io.rsFeedback.valid := RegNext(RegNext(io.in.valid))
io.rsFeedback.bits.hit := true.B
io.rsFeedback.bits.rsIdx := RegEnable(io.rsIdx, io.in.valid)
io.rsFeedback.bits.flushState := DontCare
io.rsFeedback.bits.sourceType := DontCare
// tlb translation, manipulating signals && deal with exception
when (state === s_tlb) {
......
......@@ -13,7 +13,7 @@ class LoadToLsqIO(implicit p: Parameters) extends XSBundle {
val ldout = Flipped(DecoupledIO(new ExuOutput))
val loadDataForwarded = Output(Bool())
val needReplayFromRS = Output(Bool())
val forward = new MaskedLoadForwardQueryIO
val forward = new PipeLoadForwardQueryIO
}
// Load Pipeline Stage 0
......@@ -99,7 +99,7 @@ class LoadUnit_S1(implicit p: Parameters) extends XSModule {
val dcachePAddr = Output(UInt(PAddrBits.W))
val dcacheKill = Output(Bool())
val sbuffer = new LoadForwardQueryIO
val lsq = new MaskedLoadForwardQueryIO
val lsq = new PipeLoadForwardQueryIO
})
val s1_uop = io.in.bits.uop
......@@ -156,7 +156,7 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
val io = IO(new Bundle() {
val in = Flipped(Decoupled(new LsPipelineBundle))
val out = Decoupled(new LsPipelineBundle)
val tlbFeedback = ValidIO(new TlbFeedback)
val rsFeedback = ValidIO(new RSFeedback)
val dcacheResp = Flipped(DecoupledIO(new DCacheWordResp))
val lsq = new LoadForwardQueryIO
val sbuffer = new LoadForwardQueryIO
......@@ -168,6 +168,7 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
val s2_mask = io.in.bits.mask
val s2_paddr = io.in.bits.paddr
val s2_tlb_miss = io.in.bits.tlbMiss
val s2_data_invalid = io.lsq.dataInvalid
val s2_exception = selectLoad(io.in.bits.uop.cf.exceptionVec, false).asUInt.orR
val s2_mmio = io.in.bits.mmio && !s2_exception
val s2_cache_miss = io.dcacheResp.bits.miss
......@@ -178,10 +179,18 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
assert(!(io.in.valid && dcacheShouldResp && !io.dcacheResp.valid), "DCache response got lost")
// feedback tlb result to RS
io.tlbFeedback.valid := io.in.valid
io.tlbFeedback.bits.hit := !s2_tlb_miss && (!s2_cache_replay || s2_mmio || s2_exception)
io.tlbFeedback.bits.rsIdx := io.in.bits.rsIdx
io.tlbFeedback.bits.flushState := io.in.bits.ptwBack
io.rsFeedback.valid := io.in.valid
io.rsFeedback.bits.hit := !s2_tlb_miss && (!s2_cache_replay || s2_mmio || s2_exception) && !s2_data_invalid
io.rsFeedback.bits.rsIdx := io.in.bits.rsIdx
io.rsFeedback.bits.flushState := io.in.bits.ptwBack
io.rsFeedback.bits.sourceType := Mux(s2_tlb_miss, RSFeedbackType.tlbMiss,
Mux(io.lsq.dataInvalid,
RSFeedbackType.dataInvalid,
RSFeedbackType.mshrFull
)
)
// s2_cache_replay is quite slow to generate, send it separately to LQ
io.needReplayFromRS := s2_cache_replay
// merge forward result
......@@ -189,7 +198,7 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
val forwardMask = Wire(Vec(8, Bool()))
val forwardData = Wire(Vec(8, UInt(8.W)))
val fullForward = (~forwardMask.asUInt & s2_mask) === 0.U
val fullForward = (~forwardMask.asUInt & s2_mask) === 0.U && !io.lsq.dataInvalid
io.lsq := DontCare
io.sbuffer := DontCare
......@@ -221,7 +230,7 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
))
val rdataPartialLoad = rdataHelper(s2_uop, rdataSel)
io.out.valid := io.in.valid && !s2_tlb_miss
io.out.valid := io.in.valid && !s2_tlb_miss && !s2_data_invalid
// Inst will be canceled in store queue / lsq,
// so we do not need to care about flush in load / store unit's out.valid
io.out.bits := io.in.bits
......@@ -253,9 +262,9 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
XSPerfAccumulate("dcache_miss", io.in.valid && s2_cache_miss)
XSPerfAccumulate("full_forward", io.in.valid && fullForward)
XSPerfAccumulate("dcache_miss_full_forward", io.in.valid && s2_cache_miss && fullForward)
XSPerfAccumulate("replay", io.tlbFeedback.valid && !io.tlbFeedback.bits.hit)
XSPerfAccumulate("replay_tlb_miss", io.tlbFeedback.valid && !io.tlbFeedback.bits.hit && s2_tlb_miss)
XSPerfAccumulate("replay_cache", io.tlbFeedback.valid && !io.tlbFeedback.bits.hit && !s2_tlb_miss && s2_cache_replay)
XSPerfAccumulate("replay", io.rsFeedback.valid && !io.rsFeedback.bits.hit)
XSPerfAccumulate("replay_tlb_miss", io.rsFeedback.valid && !io.rsFeedback.bits.hit && s2_tlb_miss)
XSPerfAccumulate("replay_cache", io.rsFeedback.valid && !io.rsFeedback.bits.hit && !s2_tlb_miss && s2_cache_replay)
XSPerfAccumulate("stall_out", io.out.valid && !io.out.ready)
}
......@@ -265,7 +274,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper {
val ldout = Decoupled(new ExuOutput)
val redirect = Flipped(ValidIO(new Redirect))
val flush = Input(Bool())
val tlbFeedback = ValidIO(new TlbFeedback)
val rsFeedback = ValidIO(new RSFeedback)
val rsIdx = Input(UInt(log2Up(IssQueSize).W))
val isFirstIssue = Input(Bool())
val dcache = new DCacheLoadIO
......@@ -298,11 +307,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper {
load_s2.io.dcacheResp <> io.dcache.resp
load_s2.io.lsq.forwardData <> io.lsq.forward.forwardData
load_s2.io.lsq.forwardMask <> io.lsq.forward.forwardMask
load_s2.io.lsq.dataInvalid <> io.lsq.forward.dataInvalid
load_s2.io.sbuffer.forwardData <> io.sbuffer.forwardData
load_s2.io.sbuffer.forwardMask <> io.sbuffer.forwardMask
load_s2.io.sbuffer.dataInvalid <> io.sbuffer.dataInvalid // always false
load_s2.io.dataForwarded <> io.lsq.loadDataForwarded
io.tlbFeedback.bits := RegNext(load_s2.io.tlbFeedback.bits)
io.tlbFeedback.valid := RegNext(load_s2.io.tlbFeedback.valid && !load_s2.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush))
io.rsFeedback.bits := RegNext(load_s2.io.rsFeedback.bits)
io.rsFeedback.valid := RegNext(load_s2.io.rsFeedback.valid && !load_s2.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush))
io.lsq.needReplayFromRS := load_s2.io.needReplayFromRS
// pre-calcuate sqIdx mask in s0, then send it to lsq in s1 for forwarding
......@@ -313,7 +324,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper {
// load_s2.io.dcacheResp.bits.data := Mux1H(RegNext(io.dcache.s1_hit_way), RegNext(io.dcache.s1_data))
// assert(load_s2.io.dcacheResp.bits.data === io.dcache.resp.bits.data)
io.fastUop.valid := io.dcache.s1_hit_way.orR && !io.dcache.s1_disable_fast_wakeup && load_s1.io.in.valid && !load_s1.io.dcacheKill
io.fastUop.valid := io.dcache.s1_hit_way.orR && !io.dcache.s1_disable_fast_wakeup && load_s1.io.in.valid &&
!load_s1.io.dcacheKill && !io.lsq.forward.dataInvalidFast
io.fastUop.bits := load_s1.io.out.bits.uop
XSDebug(load_s0.io.out.valid,
......
......@@ -39,7 +39,9 @@ class StoreUnit_S0(implicit p: Parameters) extends XSModule {
io.out.bits := DontCare
io.out.bits.vaddr := saddr
io.out.bits.data := genWdata(io.in.bits.src2, io.in.bits.uop.ctrl.fuOpType(1,0))
// Now data use its own io
// io.out.bits.data := genWdata(io.in.bits.src2, io.in.bits.uop.ctrl.fuOpType(1,0))
io.out.bits.data := io.in.bits.src2 // FIXME: remove data from pipeline
io.out.bits.uop := io.in.bits.uop
io.out.bits.miss := DontCare
io.out.bits.rsIdx := io.rsIdx
......@@ -70,7 +72,7 @@ class StoreUnit_S1(implicit p: Parameters) extends XSModule {
val out = Decoupled(new LsPipelineBundle)
val lsq = ValidIO(new LsPipelineBundle)
val dtlbResp = Flipped(DecoupledIO(new TlbResp))
val tlbFeedback = ValidIO(new TlbFeedback)
val rsFeedback = ValidIO(new RSFeedback)
})
val s1_paddr = io.dtlbResp.bits.paddr
......@@ -83,14 +85,15 @@ class StoreUnit_S1(implicit p: Parameters) extends XSModule {
io.dtlbResp.ready := true.B // TODO: why dtlbResp needs a ready?
// Send TLB feedback to store issue queue
io.tlbFeedback.valid := io.in.valid
io.tlbFeedback.bits.hit := !s1_tlb_miss
io.tlbFeedback.bits.flushState := io.dtlbResp.bits.ptwBack
io.tlbFeedback.bits.rsIdx := io.in.bits.rsIdx
XSDebug(io.tlbFeedback.valid,
io.rsFeedback.valid := io.in.valid
io.rsFeedback.bits.hit := !s1_tlb_miss
io.rsFeedback.bits.flushState := io.dtlbResp.bits.ptwBack
io.rsFeedback.bits.rsIdx := io.in.bits.rsIdx
io.rsFeedback.bits.sourceType := RSFeedbackType.tlbMiss
XSDebug(io.rsFeedback.valid,
"S1 Store: tlbHit: %d roqIdx: %d\n",
io.tlbFeedback.bits.hit,
io.tlbFeedback.bits.rsIdx
io.rsFeedback.bits.hit,
io.rsFeedback.bits.rsIdx
)
......@@ -146,7 +149,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
val stin = Flipped(Decoupled(new ExuInput))
val redirect = Flipped(ValidIO(new Redirect))
val flush = Input(Bool())
val tlbFeedback = ValidIO(new TlbFeedback)
val rsFeedback = ValidIO(new RSFeedback)
val dtlb = new TlbRequestIO()
val rsIdx = Input(UInt(log2Up(IssQueSize).W))
val isFirstIssue = Input(Bool())
......@@ -168,7 +171,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
store_s1.io.lsq <> io.lsq // send result to sq
store_s1.io.dtlbResp <> io.dtlb.resp
store_s1.io.tlbFeedback <> io.tlbFeedback
store_s1.io.rsFeedback <> io.rsFeedback
PipelineConnect(store_s1.io.out, store_s2.io.in, true.B, store_s1.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush))
......
......@@ -387,6 +387,7 @@ class NewSbuffer(implicit p: Parameters) extends XSModule with HasSbufferConst {
val selectedInflightMask = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val selectedInflightData = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, data).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))
forward.dataInvalid := false.B // data in store line merge buffer is always ready
for (j <- 0 until DataBytes) {
forward.forwardMask(j) := false.B
forward.forwardData(j) := DontCare
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册