未验证 提交 edd6ddbc 编写于 作者: W wakafa 提交者: GitHub

Add some in-core hardware performance counters (#731)

* csr: remove unused input perfcnt io

* perfcnt: add some in-core hardware performance counters

* perfcnt: optimize timing for hardware performance counters
上级 4f62e33d
......@@ -432,6 +432,11 @@ class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer)
// CSR-side wiring of the integer block.
integerBlock.io.csrio.hartId <> io.hartId
// Default the whole perf bundle to DontCare first; the connections that follow override
// individual fields. A perf field that is not connected after this line keeps the
// DontCare default (cacheInfo is not connected anywhere in the lines shown here).
integerBlock.io.csrio.perf <> DontCare
integerBlock.io.csrio.perf.retiredInstr <> ctrlBlock.io.roqio.toCSR.perfinfo.retiredInstr
integerBlock.io.csrio.perf.bpuInfo <> ctrlBlock.io.perfInfo.bpuInfo
integerBlock.io.csrio.perf.ctrlInfo <> ctrlBlock.io.perfInfo.ctrlInfo
integerBlock.io.csrio.perf.memInfo <> memBlock.io.memInfo
integerBlock.io.csrio.perf.frontendInfo <> frontend.io.frontendInfo
integerBlock.io.csrio.fpu.fflags <> ctrlBlock.io.roqio.toCSR.fflags
// isIllegal is tied low here.
integerBlock.io.csrio.fpu.isIllegal := false.B
integerBlock.io.csrio.fpu.dirty_fs <> ctrlBlock.io.roqio.toCSR.dirty_fs
......
......@@ -193,6 +193,18 @@ class CtrlBlock extends XSModule with HasCircularQueuePtrHelper {
val lsq = new RoqLsqIO
}
val csrCtrl = Input(new CustomCSRCtrlIO)
// Performance-event outputs of the control block.
// The enclosing Output(...) coerces every leaf of the bundle to an output, so the leaves are
// declared without a direction of their own: an inner Input(...) would be overridden anyway
// and only misleads the reader about which way the signal flows.
val perfInfo = Output(new Bundle {
  // Queue-full flags of the ROQ and of the int / fp / ls dispatch queues.
  val ctrlInfo = new Bundle {
    val roqFull = Bool()
    val intdqFull = Bool()
    val fpdqFull = Bool()
    val lsdqFull = Bool()
  }
  // Branch-prediction right / wrong counts forwarded from the FTQ.
  val bpuInfo = new Bundle {
    val bpRight = UInt(XLEN.W)
    val bpWrong = UInt(XLEN.W)
  }
})
})
val difftestIO = IO(new Bundle() {
......@@ -380,8 +392,15 @@ class CtrlBlock extends XSModule with HasCircularQueuePtrHelper {
// roq to int block
io.roqio.toCSR <> roq.io.csr
// Overrides the bulk connection above for this one field: the retired-instruction count
// is passed through a register before it leaves the control block.
io.roqio.toCSR.perfinfo.retiredInstr <> RegNext(roq.io.csr.perfinfo.retiredInstr)
io.roqio.exception := roq.io.exception
// Overrides the pc carried in roq.io.exception with flushPC.
io.roqio.exception.bits.uop.cf.pc := flushPC
// roq to mem block
io.roqio.lsq <> roq.io.lsq

// Performance events: every source is registered once (RegNext) before being exported.
io.perfInfo.ctrlInfo.roqFull := RegNext(roq.io.roqFull)
io.perfInfo.ctrlInfo.intdqFull := RegNext(dispatch.io.ctrlInfo.intdqFull)
io.perfInfo.ctrlInfo.fpdqFull := RegNext(dispatch.io.ctrlInfo.fpdqFull)
io.perfInfo.ctrlInfo.lsdqFull := RegNext(dispatch.io.ctrlInfo.lsdqFull)
// Registers the whole bpuInfo bundle (both XLEN-wide fields).
io.perfInfo.bpuInfo <> RegNext(ftq.io.bpuInfo)
}
......@@ -242,6 +242,7 @@ class IntegerBlock
}
// Connect the whole CSR interface, then override the perf sub-bundle with a registered
// copy so the performance events reach the jump/CSR unit one cycle later.
jmpExeUnit.csrio <> io.csrio
jmpExeUnit.csrio.perf <> RegNext(io.csrio.perf)
jmpExeUnit.fenceio <> io.fenceio
if (!env.FPGAPlatform) {
jmpExeUnit.difftestIO.fromCSR <> difftestIO.fromCSR
......
......@@ -83,6 +83,11 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val csrCtrl = Flipped(new CustomCSRCtrlIO)
val error = new L1CacheErrorInfo
// Performance-event flags exported by the memory block: store queue full, load queue full
// and dcache miss-queue (MSHR) full.
val memInfo = new Bundle {
val sqFull = Output(Bool())
val lqFull = Output(Bool())
val dcacheMSHRFull = Output(Bool())
}
})
val difftestIO = IO(new Bundle() {
val fromSbuffer = new Bundle() {
......@@ -402,5 +407,9 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
// Forward the exception-address query to the LSQ.
lsq.io.exceptionAddr.lsIdx := io.lsqio.exceptionAddr.lsIdx
lsq.io.exceptionAddr.isStore := io.lsqio.exceptionAddr.isStore
// The atomics unit's exception address takes priority over the LSQ's when it is valid.
io.lsqio.exceptionAddr.vaddr := Mux(atomicsUnit.io.exceptionAddr.valid, atomicsUnit.io.exceptionAddr.bits, lsq.io.exceptionAddr.vaddr)
// Performance events: each full flag is registered once before leaving the memory block.
io.memInfo.sqFull := RegNext(lsq.io.sqFull)
io.memInfo.lqFull := RegNext(lsq.io.lqFull)
io.memInfo.dcacheMSHRFull := RegNext(dcache.io.mshrFull)
}
......@@ -50,6 +50,12 @@ class Dispatch extends XSModule {
val fpIndex = Vec(exuParameters.FpExuCnt, Output(UInt(log2Ceil((NRFpReadPorts - exuParameters.StuCnt) / 3).W)))
// ls: hardwired to (0, 1, 2, 4)
}
// Queue-full performance flags exported by the dispatch stage.
// NOTE(review): roqFull is never driven by Dispatch in the visible code (it is only covered
// by `io.ctrlInfo <> DontCare`); CtrlBlock takes its roqFull from roq.io.roqFull instead.
// Confirm whether this field is still needed.
val ctrlInfo = new Bundle {
val roqFull = Output(Bool())
val intdqFull = Output(Bool())
val fpdqFull = Output(Bool())
val lsdqFull = Output(Bool())
}
})
val dispatch1 = Module(new Dispatch1)
......@@ -114,4 +120,9 @@ class Dispatch extends XSModule {
// Wire the load/store dispatch stage to its slice of the issue-queue ports. That slice sits
// after the integer and floating-point ports in the shared vectors, hence the index offset.
// foreach (not map) is used because the connections are pure side effects and no result
// collection is wanted.
lsDispatch.io.numExist.zipWithIndex.foreach { case (num, i) => num := io.numExist(exuParameters.IntExuCnt + exuParameters.FpExuCnt + i) }
lsDispatch.io.enqIQCtrl.zipWithIndex.foreach { case (enq, i) => enq <> io.enqIQCtrl(exuParameters.IntExuCnt + exuParameters.FpExuCnt + i) }
// lsDispatch.io.enqIQData.zipWithIndex.map({case (enq, i) => enq <> io.enqIQData(exuParameters.IntExuCnt + exuParameters.FpExuCnt + i)})

// Performance flags: default the bundle to DontCare, then drive the three dispatch-queue
// full flags. Any field not driven below (roqFull) keeps the DontCare default.
io.ctrlInfo <> DontCare
io.ctrlInfo.intdqFull := intDq.io.dqFull
io.ctrlInfo.fpdqFull := fpDq.io.dqFull
io.ctrlInfo.lsdqFull := lsDq.io.dqFull
}
......@@ -20,6 +20,7 @@ class DispatchQueueIO(enqnum: Int, deqnum: Int) extends XSBundle {
val flush = Input(Bool())
// Rebuilds the bundle with the same constructor parameters when it is cloned.
override def cloneType: DispatchQueueIO.this.type =
new DispatchQueueIO(enqnum, deqnum).asInstanceOf[this.type]
// Performance flag; DispatchQueue drives it with !canEnqueue.
val dqFull = Output(Bool())
}
// dispatch queue: accepts at most enqnum uops from dispatch1 and dispatches deqnum uops at every clock cycle
......@@ -204,6 +205,7 @@ class DispatchQueue(size: Int, enqnum: Int, deqnum: Int, name: String) extends X
// XSError(isAfter(headPtr(0), tailPtr(0)), p"assert greaterOrEqualThan(tailPtr: ${tailPtr(0)}, headPtr: ${headPtr(0)}) failed\n")
// Occupancy statistics: number of entries whose state is not s_invalid, plus the full condition.
QueuePerf(size, PopCount(stateEntries.map(_ =/= s_invalid)), !canEnqueue)
// Export the same full condition that is passed to QueuePerf.
io.dqFull := !canEnqueue
XSPerfAccumulate("in", numEnq)
// "out" counts dequeue ports that fire; "out_try" counts ports that are merely valid.
XSPerfAccumulate("out", PopCount(io.deq.map(_.fire())))
XSPerfAccumulate("out_try", PopCount(io.deq.map(_.valid)))
......
......@@ -106,6 +106,10 @@ class Ftq extends XSModule with HasCircularQueuePtrHelper {
// pc read reqs (0: jump/auipc 1~6: mispredict/load replay 7: exceptions)
val ftqRead = Vec(1 + 6 + 1, Flipped(new FtqRead))
val cfiRead = Flipped(new FtqRead)
// Branch-prediction performance outputs: counts of correctly / incorrectly predicted
// control-flow instructions in the current commit entry (driven from PopCount of
// predRights / predWrongs).
val bpuInfo = new Bundle {
val bpRight = Output(UInt(XLEN.W))
val bpWrong = Output(UInt(XLEN.W))
}
})
val headPtr, tailPtr = RegInit(FtqPtr(false.B, 0.U))
......@@ -298,11 +302,12 @@ class Ftq extends XSModule with HasCircularQueuePtrHelper {
// Redirect statistics, split by redirect level.
XSPerfAccumulate("mispredictRedirect", io.redirect.valid && RedirectLevel.flushAfter === io.redirect.bits.level)
XSPerfAccumulate("replayRedirect", io.redirect.valid && RedirectLevel.flushItself(io.redirect.bits.level))

// A slot of the commit entry takes part in prediction statistics only when it holds a
// control-flow instruction (notCFI clear) and is valid.
def isCommittedCfi(slot: Int) = !commitEntry.pd(slot).notCFI && commitEntry.valids(slot)
// Per-slot outcome: predicted correctly / mispredicted.
val predRights = (0 until PredictWidth).map { slot =>
  !commitEntry.mispred(slot) && isCommittedCfi(slot)
}
val predWrongs = (0 until PredictWidth).map { slot =>
  commitEntry.mispred(slot) && isCommittedCfi(slot)
}
// Branch Predictor Perf counters
if (!env.FPGAPlatform && env.EnablePerfDebug) {
val fires = commitEntry.valids.zip(commitEntry.pd).map{case (valid, pd) => valid && !pd.notCFI}
val predRights = (0 until PredictWidth).map{i => !commitEntry.mispred(i) && !commitEntry.pd(i).notCFI && commitEntry.valids(i)}
val predWrongs = (0 until PredictWidth).map{i => commitEntry.mispred(i) && !commitEntry.pd(i).notCFI && commitEntry.valids(i)}
val isBTypes = (0 until PredictWidth).map{i => commitEntry.pd(i).isBr}
val isJTypes = (0 until PredictWidth).map{i => commitEntry.pd(i).isJal}
val isITypes = (0 until PredictWidth).map{i => commitEntry.pd(i).isJalr}
......@@ -424,4 +429,7 @@ class Ftq extends XSModule with HasCircularQueuePtrHelper {
XSDebug(io.commit_ftqEntry.valid, p"ftq commit: ${io.commit_ftqEntry.bits}")
XSDebug(io.enq.fire(), p"ftq enq: ${io.enq.bits}")
// Export how many slots of the commit entry were predicted right / wrong.
// The PopCount result is assigned to an XLEN-wide output.
io.bpuInfo.bpRight := PopCount(predRights)
io.bpuInfo.bpWrong := PopCount(predWrongs)
}
......@@ -111,14 +111,39 @@ class FpuCsrIO extends XSBundle {
/** Performance-event signals consumed by the CSR unit.
  *
  * The fields carry no direction of their own: the owner is expected to wrap the whole
  * bundle (CSRFileIO declares it as `Input(new PerfCounterIO)`), and that wrapper coerces
  * every leaf. `retiredInstr` is declared exactly once; a second, directional declaration
  * of the same member cannot compile.
  */
class PerfCounterIO extends XSBundle {
  // Number of instructions retired, accumulated into minstret by the CSR unit.
  val retiredInstr = UInt(3.W)
  // NOTE(review): `value` is neither driven nor read in the visible code; it is kept only so
  // that any remaining user of the field still compiles. Confirm it can be removed.
  val value = UInt(XLEN.W)
  val frontendInfo = new Bundle {
    val ibufFull = Bool()
  }
  val ctrlInfo = new Bundle {
    val roqFull = Bool()
    val intdqFull = Bool()
    val fpdqFull = Bool()
    val lsdqFull = Bool()
  }
  val memInfo = new Bundle {
    val sqFull = Bool()
    val lqFull = Bool()
    val dcacheMSHRFull = Bool()
  }
  val bpuInfo = new Bundle {
    val bpRight = UInt(XLEN.W)
    val bpWrong = UInt(XLEN.W)
  }
  // NOTE(review): no driver for cacheInfo is visible here; in XSCoreImp it is only covered
  // by the DontCare default on the perf bundle.
  val cacheInfo = new Bundle {
    val l2MSHRFull = Bool()
    val l3MSHRFull = Bool()
    val l2nAcquire = UInt(XLEN.W)
    val l2nAcquireMiss = UInt(XLEN.W)
    val l3nAcquire = UInt(XLEN.W)
    val l3nAcquireMiss = UInt(XLEN.W)
  }
}
class CSRFileIO extends XSBundle {
val hartId = Input(UInt(64.W))
// output (for func === CSROpType.jmp)
// Performance events are sampled by the CSR unit, so the whole bundle is an input.
// Input(...) coerces every leaf of PerfCounterIO regardless of how the leaves are declared.
// `perf` is declared exactly once; a second declaration of the same member cannot compile.
val perf = Input(new PerfCounterIO)
val isPerfCnt = Output(Bool())
// to FPU
val fpu = Flipped(new FpuCsrIO)
......@@ -450,6 +475,26 @@ class CSR extends FunctionUnit with HasCSRConst
// Cycle counter: incremented by one every clock.
mcycle := mcycle + 1.U
// Retired-instruction counter: accumulates the retired count, delayed by one register.
val minstret = RegInit(0.U(XLEN.W))
minstret := minstret + RegNext(csrio.perf.retiredInstr)

// In-core hardware performance counters.
// Each counter is an XLEN-wide register reset to 0 that accumulates its event input after
// one extra register stage (RegNext). The *Full inputs are single-bit flags, so those
// counters advance by one for every cycle the flag was set; bpRight / bpWrong add a count.
// The increments are unconditional: mcountinhibit is not consulted in these lines.
val ibufFull = RegInit(0.U(XLEN.W))
ibufFull := ibufFull + RegNext(csrio.perf.frontendInfo.ibufFull)
val roqFull = RegInit(0.U(XLEN.W))
roqFull := roqFull + RegNext(csrio.perf.ctrlInfo.roqFull)
val intdqFull = RegInit(0.U(XLEN.W))
intdqFull := intdqFull + RegNext(csrio.perf.ctrlInfo.intdqFull)
val fpdqFull = RegInit(0.U(XLEN.W))
fpdqFull := fpdqFull + RegNext(csrio.perf.ctrlInfo.fpdqFull)
val lsdqFull = RegInit(0.U(XLEN.W))
lsdqFull := lsdqFull + RegNext(csrio.perf.ctrlInfo.lsdqFull)
val sqFull = RegInit(0.U(XLEN.W))
sqFull := sqFull + RegNext(csrio.perf.memInfo.sqFull)
val lqFull = RegInit(0.U(XLEN.W))
lqFull := lqFull + RegNext(csrio.perf.memInfo.lqFull)
val dcacheMSHRFull = RegInit(0.U(XLEN.W))
dcacheMSHRFull := dcacheMSHRFull + RegNext(csrio.perf.memInfo.dcacheMSHRFull)
val bpRight = RegInit(0.U(XLEN.W))
bpRight := bpRight + RegNext(csrio.perf.bpuInfo.bpRight)
val bpWrong = RegInit(0.U(XLEN.W))
bpWrong := bpWrong + RegNext(csrio.perf.bpuInfo.bpWrong)
// CSR reg map
val basicPrivMapping = Map(
......@@ -536,6 +581,16 @@ class CSR extends FunctionUnit with HasCSRConst
MaskedRegMap(Mcountinhibit, mcountinhibit),
MaskedRegMap(Mcycle, mcycle),
MaskedRegMap(Minstret, minstret),
// In-core performance counters, exposed at the Mhpmevent3..Mhpmevent12 addresses.
// NOTE(review): these registers hold running counts, yet they are mapped at the
// Mhpmevent* addresses rather than Mhpmcounter*. In the RISC-V privileged spec mhpmeventN
// is the event selector and mhpmcounterN holds the count; confirm this mapping is
// intentional before software starts depending on it.
MaskedRegMap(Mhpmevent3, ibufFull),
MaskedRegMap(Mhpmevent4, roqFull),
MaskedRegMap(Mhpmevent5, intdqFull),
MaskedRegMap(Mhpmevent6, fpdqFull),
MaskedRegMap(Mhpmevent7, lsdqFull),
MaskedRegMap(Mhpmevent8, sqFull),
MaskedRegMap(Mhpmevent9, lqFull),
MaskedRegMap(Mhpmevent10, dcacheMSHRFull),
MaskedRegMap(Mhpmevent11, bpRight),
MaskedRegMap(Mhpmevent12, bpWrong),
)
val MhpmcounterStart = Mhpmcounter3
val MhpmeventStart = Mhpmevent3
......
......@@ -261,6 +261,7 @@ class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper {
val bcommit = Output(UInt(BrTagWidth.W))
val roqDeqPtr = Output(new RoqPtr)
val csr = new RoqCSRIO
// Performance flag; driven with !allowEnqueue.
val roqFull = Output(Bool())
})
val difftestIO = IO(new Bundle() {
......@@ -796,6 +797,7 @@ class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper {
// Counts every clock cycle.
XSPerfAccumulate("clock_cycle", 1.U)
// Occupancy statistics: number of valid ROQ entries, plus the full condition.
QueuePerf(RoqSize, PopCount((0 until RoqSize).map(valid(_))), !allowEnqueue)
// Export the same full condition that is passed to QueuePerf.
io.roqFull := !allowEnqueue
// Commit slots are counted only when the ROQ is not walking (isWalk clear).
XSPerfAccumulate("commitInstr", Mux(io.commits.isWalk, 0.U, PopCount(io.commits.valid)))
// Per-slot isMove flag of the micro-op at each dequeue pointer.
val commitIsMove = deqPtrVec.map(_.value).map(ptr => debug_microOp(ptr).ctrl.isMove)
XSPerfAccumulate("commitInstrMove", Mux(io.commits.isWalk, 0.U, PopCount(io.commits.valid.zip(commitIsMove).map{ case (v, m) => v && m })))
......
......@@ -108,6 +108,7 @@ class DCacheToLsuIO extends DCacheBundle {
/** Top-level IO bundle of the data cache. */
class DCacheIO extends DCacheBundle {
  val lsu      = new DCacheToLsuIO
  val error    = new L1CacheErrorInfo
  // Performance flag; DCacheImp drives it from missQueue.io.full.
  val mshrFull = Output(Bool())
}
......@@ -347,4 +348,6 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
// performance counters
// Number of load units whose request fires in this cycle.
val num_loads = PopCount(ldu.map(e => e.io.lsu.req.fire()))
XSPerfAccumulate("num_loads", num_loads)
// Forward the miss queue's full flag to the cache's top-level IO.
io.mshrFull := missQueue.io.full
}
......@@ -382,6 +382,8 @@ class MissQueue(edge: TLEdgeOut) extends DCacheModule with HasTLDump
// block probe
val probe_req = Input(UInt(PAddrBits.W))
val probe_block = Output(Bool())
// Performance flag; driven with num_valids === cfg.nMissEntries.U.
val full = Output(Bool())
})
val pipe_req_arb = Module(new RRArbiter(new MainPipeReq, cfg.nMissEntries))
......@@ -544,5 +546,6 @@ class MissQueue(edge: TLEdgeOut) extends DCacheModule with HasTLDump
// max inflight (average) = max_inflight_total / cycle cnt
XSPerfAccumulate("max_inflight", max_inflight)
QueuePerf(cfg.nMissEntries, num_valids, num_valids === cfg.nMissEntries.U)
// The queue is reported full when num_valids (defined outside this excerpt) equals
// cfg.nMissEntries; this is the same condition that is passed to QueuePerf above.
io.full := num_valids === cfg.nMissEntries.U
XSPerfHistogram("num_valids", num_valids, true.B, 0, cfg.nMissEntries, 1)
}
......@@ -35,6 +35,9 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
val tlbCsr = Input(new TlbCsrBundle)
val csrCtrl = Input(new CustomCSRCtrlIO)
val error = new L1CacheErrorInfo
// Frontend performance events; ibufFull is a registered copy of ibuffer.io.full.
val frontendInfo = new Bundle {
val ibufFull = Output(Bool())
}
})
val ifu = Module(new IFU)
......@@ -106,4 +109,6 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
// Number of decode slots in which the backend is ready but the instruction buffer has
// no valid instruction to offer.
val frontendBubble = PopCount((0 until DecodeWidth).map(i => io.backend.cfVec(i).ready && !ibuffer.io.out(i).valid))
XSPerfAccumulate("FrontendBubble", frontendBubble)
// Register the buffer-full flag once before exporting it.
io.frontendInfo.ibufFull := RegNext(ibuffer.io.full)
}
......@@ -24,6 +24,7 @@ class IBufferIO extends XSBundle {
val flush = Input(Bool())
val in = Flipped(DecoupledIO(new FetchPacket))
val out = Vec(DecodeWidth, DecoupledIO(new CtrlFlow))
// Performance flag; Ibuffer drives it with !allowEnq.
val full = Output(Bool())
}
class Ibuffer extends XSModule with HasCircularQueuePtrHelper {
......@@ -201,6 +202,7 @@ class Ibuffer extends XSModule with HasCircularQueuePtrHelper {
// "Hungry": after initialisation, the buffer holds no valid entries and headBubble is clear.
val instrHungry = afterInit && (validEntries === 0.U) && !headBubble
QueuePerf(IBufSize, validEntries, !allowEnq)
// Export the same full condition that is passed to QueuePerf.
io.full := !allowEnq
XSPerfAccumulate("flush", io.flush)
XSPerfAccumulate("hungry", instrHungry)
}
......@@ -56,6 +56,8 @@ class LsqWrappper extends XSModule with HasDCacheParameters {
val sqempty = Output(Bool())
val issuePtrExt = Output(new SqPtr)
val storeIssue = Vec(StorePipelineWidth, Flipped(Valid(new ExuInput)))
// Performance flags forwarded from the store queue and the load queue respectively.
val sqFull = Output(Bool())
val lqFull = Output(Bool())
})
val difftestIO = IO(new Bundle() {
val fromSQ = new Bundle() {
......@@ -168,4 +170,6 @@ class LsqWrappper extends XSModule with HasDCacheParameters {
// The load queue and the store queue must never both present an uncache response in the same cycle.
assert(!(loadQueue.io.uncache.resp.valid && storeQueue.io.uncache.resp.valid))
// An uncache response must not arrive while pendingstate is s_idle.
assert(!((loadQueue.io.uncache.resp.valid || storeQueue.io.uncache.resp.valid) && pendingstate === s_idle))
// Forward the per-queue full flags unchanged.
io.lqFull := loadQueue.io.lqFull
io.sqFull := storeQueue.io.sqFull
}
......@@ -77,6 +77,7 @@ class LoadQueue extends XSModule
val dcache = Flipped(ValidIO(new Refill))
val uncache = new DCacheWordIO
val exceptionAddr = new ExceptionAddrIO
// Performance flag; driven with !allowEnqueue.
val lqFull = Output(Bool())
})
val uop = Reg(Vec(LoadQueueSize, new MicroOp))
......@@ -641,6 +642,7 @@ class LoadQueue extends XSModule
// perf counter
QueuePerf(LoadQueueSize, validCount, !allowEnqueue)
// Export the same full condition that is passed to QueuePerf.
io.lqFull := !allowEnqueue
XSPerfAccumulate("rollback", io.rollback.valid) // rollback redirect generated
XSPerfAccumulate("mmioCycle", uncacheState =/= s_idle) // lq is busy dealing with uncache req
XSPerfAccumulate("mmioCnt", io.uncache.req.fire())
......
......@@ -46,6 +46,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
val sqempty = Output(Bool())
val issuePtrExt = Output(new SqPtr)
val storeIssue = Vec(StorePipelineWidth, Flipped(Valid(new ExuInput)))
// Performance flag; driven with !allowEnqueue.
val sqFull = Output(Bool())
})
val difftestIO = IO(new Bundle() {
......@@ -433,6 +434,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
// perf counter
QueuePerf(StoreQueueSize, validCount, !allowEnqueue)
// Export the same full condition that is passed to QueuePerf.
io.sqFull := !allowEnqueue
XSPerfAccumulate("mmioCycle", uncacheState =/= s_idle) // sq is busy dealing with uncache req
XSPerfAccumulate("mmioCnt", io.uncache.req.fire())
XSPerfAccumulate("mmio_wb_success", io.mmioStout.fire())
XSPerfAccumulate("mmio_wb_success", io.mmioStout.fire())
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册