提交 f773310b 编写于 作者: L lixin

Merge remote-tracking branch 'origin/master' into missqueue_enq_opt

......@@ -13,12 +13,13 @@ jobs:
continue-on-error: false
name: Generate Verilog
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v2
with:
submodules: 'recursive'
- name: set env
run: |
export HEAD_SHA=${{ github.run_number }}
echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "NEMU_HOME=/nfs/home/share/ci-workloads/NEMU" >> $GITHUB_ENV
echo "WAVE_HOME=/nfs/home/ci-runner/xs-wave/${HEAD_SHA}" >> $GITHUB_ENV
mkdir -p /nfs/home/ci-runner/xs-wave/${HEAD_SHA}
......@@ -48,12 +49,13 @@ jobs:
timeout-minutes: 900
name: EMU - Basics
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v2
with:
submodules: 'recursive'
- name: set env
run: |
export HEAD_SHA=${{ github.run_number }}
echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "NEMU_HOME=/nfs/home/share/ci-workloads/NEMU" >> $GITHUB_ENV
echo "AM_HOME=/nfs/home/share/ci-workloads/nexus-am" >> $GITHUB_ENV
echo "PERF_HOME=/nfs/home/ci-runner/xs-perf/${HEAD_SHA}" >> $GITHUB_ENV
......@@ -104,12 +106,13 @@ jobs:
timeout-minutes: 900
name: EMU - Performance
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v2
with:
submodules: 'recursive'
- name: set env
run: |
export HEAD_SHA=${{ github.run_number }}
echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "NEMU_HOME=/nfs/home/share/ci-workloads/NEMU" >> $GITHUB_ENV
echo "AM_HOME=/nfs/home/share/ci-workloads/nexus-am" >> $GITHUB_ENV
echo "PERF_HOME=/nfs/home/ci-runner/xs-perf/${HEAD_SHA}" >> $GITHUB_ENV
......@@ -166,12 +169,13 @@ jobs:
timeout-minutes: 900
name: EMU - MC
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v2
with:
submodules: 'recursive'
- name: set env
run: |
export HEAD_SHA=${{ github.run_number }}
echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "NEMU_HOME=/nfs/home/share/ci-workloads/NEMU" >> $GITHUB_ENV
echo "AM_HOME=/nfs/home/share/ci-workloads/nexus-am" >> $GITHUB_ENV
echo "PERF_HOME=/nfs/home/ci-runner/xs-perf/${HEAD_SHA}" >> $GITHUB_ENV
......@@ -199,12 +203,13 @@ jobs:
# timeout-minutes: 900
# name: SIMV - Basics
# steps:
# - uses: actions/checkout@v3
# - uses: actions/checkout@v2
# with:
# submodules: 'recursive'
# - name: set env
# run: |
# export HEAD_SHA=${{ github.run_number }}
# echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV
# echo "NEMU_HOME=/nfs/home/share/ci-workloads/NEMU" >> $GITHUB_ENV
# echo "AM_HOME=/nfs/home/share/ci-workloads/nexus-am" >> $GITHUB_ENV
# echo "PERF_HOME=/nfs/home/ci-runner/xs-perf/${HEAD_SHA}" >> $GITHUB_ENV
......
......@@ -13,12 +13,13 @@ jobs:
# Build + 8 checkpoints * 1-hour timeout
name: Nightly Regression - Checkpoints
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v2
with:
submodules: 'recursive'
- name: set env
run: |
export HEAD_SHA=${{ github.run_number }}
echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "NEMU_HOME=/nfs/home/share/ci-workloads/NEMU" >> $GITHUB_ENV
echo "PERF_HOME=/nfs/home/ci-runner/xs-perf/${HEAD_SHA}" >> $GITHUB_ENV
echo "WAVE_HOME=/nfs/home/ci-runner/xs-wave/${HEAD_SHA}" >> $GITHUB_ENV
......
......@@ -16,3 +16,6 @@
[submodule "utility"]
path = utility
url = https://github.com/OpenXiangShan/utility
[submodule "coupledL2"]
path = coupledL2
url = https://github.com/OpenXiangShan/coupledL2
......@@ -21,6 +21,7 @@ TOP_V = $(BUILD_DIR)/$(TOP).v
SCALA_FILE = $(shell find ./src/main/scala -name '*.scala')
TEST_FILE = $(shell find ./src/test/scala -name '*.scala')
MEM_GEN = ./scripts/vlsi_mem_gen
MEM_GEN_SEP = ./scripts/gen_sep_mem.sh
SIMTOP = top.SimTop
IMAGE ?= temp
......@@ -34,8 +35,8 @@ SIM_MEM_ARGS = --infer-rw --repl-seq-mem -c:$(SIMTOP):-o:$(@D)/$(@F).conf --gen-
# select firrtl compiler
ifeq ($(MFC),1)
override FC_ARGS = --mfc
override FPGA_MEM_ARGS = --infer-rw
override SIM_MEM_ARGS = --infer-rw
override FPGA_MEM_ARGS = --infer-rw --firtool-opt -split-verilog --firtool-opt -o --firtool-opt build --firtool-opt -repl-seq-mem --firtool-opt -repl-seq-mem-circuit=$(FPGATOP) --firtool-opt -repl-seq-mem-file=XSTop.v.conf
override SIM_MEM_ARGS = --infer-rw --firtool-opt -split-verilog --firtool-opt -o --firtool-opt build --firtool-opt -repl-seq-mem --firtool-opt -repl-seq-mem-circuit=$(SIMTOP) --firtool-opt -repl-seq-mem-file=SimTop.v.conf
endif
......@@ -47,9 +48,11 @@ endif
override SIM_ARGS += --with-dramsim3
endif
# top-down
ifeq ($(ENABLE_TOPDOWN),1)
override SIM_ARGS += --enable-topdown
# dynamic switch CONSTANTIN
ifeq ($(WITH_CONSTANTIN),0)
$(info disable WITH_CONSTANTIN)
else
override SIM_ARGS += --with-constantin
endif
# emu for the release version
......@@ -66,10 +69,6 @@ TIME_CMD = time -a -o $(TIMELOG)
SED_CMD = sed -i -e 's/_\(aw\|ar\|w\|r\|b\)_\(\|bits_\)/_\1/g'
# add comments to 'firrtl_black_box_resource_files'
AWK_CMD = gawk -i inplace 'BEGIN{f=0} /FILE "firrtl_black_box_resource_files.f"/{f=1} !f{print $$0} f{print "//", $$0}'
.DEFAULT_GOAL = verilog
help:
......@@ -82,10 +81,12 @@ $(TOP_V): $(SCALA_FILE)
$(FPGA_MEM_ARGS) \
--num-cores $(NUM_CORES) \
$(RELEASE_ARGS) $(FC_ARGS)
$(SED_CMD) $@
ifeq ($(MFC),1)
$(AWK_CMD) $@
for file in $(BUILD_DIR)/*.sv; do $(SED_CMD) "$${file}"; mv "$${file}" "$${file%.sv}.v"; done
mv $(BUILD_DIR)/$(BUILD_DIR)/* $(BUILD_DIR)
$(MEM_GEN_SEP) "$(MEM_GEN)" "$(TOP_V).conf" "$(BUILD_DIR)"
endif
$(SED_CMD) $@
@git log -n 1 >> .__head__
@git diff >> .__diff__
@sed -i 's/^/\/\// ' .__head__
......@@ -107,10 +108,12 @@ $(SIM_TOP_V): $(SCALA_FILE) $(TEST_FILE)
$(SIM_MEM_ARGS) \
--num-cores $(NUM_CORES) \
$(SIM_ARGS) $(FC_ARGS)
$(SED_CMD) $@
ifeq ($(MFC),1)
$(AWK_CMD) $@
for file in $(BUILD_DIR)/*.sv; do $(SED_CMD) "$${file}"; mv "$${file}" "$${file%.sv}.v"; done
mv $(BUILD_DIR)/$(BUILD_DIR)/* $(BUILD_DIR)
$(MEM_GEN_SEP) "$(MEM_GEN)" "$(SIM_TOP_V).conf" "$(BUILD_DIR)"
endif
$(SED_CMD) $@
@git log -n 1 >> .__head__
@git diff >> .__diff__
@sed -i 's/^/\/\// ' .__head__
......
......@@ -119,6 +119,17 @@ object huancun extends XSModule with SbtModule {
)
}
object coupledL2 extends XSModule with SbtModule {
override def millSourcePath = os.pwd / "coupledL2"
override def moduleDeps = super.moduleDeps ++ Seq(
rocketchip,
huancun,
utility
)
}
object difftest extends XSModule with SbtModule {
override def millSourcePath = os.pwd / "difftest"
}
......@@ -141,6 +152,7 @@ trait CommonXiangShan extends XSModule with SbtModule { m =>
def rocketModule: PublishModule
def difftestModule: PublishModule
def huancunModule: PublishModule
def coupledL2Module: PublishModule
def fudianModule: PublishModule
def utilityModule: PublishModule
......@@ -154,6 +166,7 @@ trait CommonXiangShan extends XSModule with SbtModule { m =>
rocketModule,
difftestModule,
huancunModule,
coupledL2Module,
fudianModule,
utilityModule
)
......@@ -174,6 +187,7 @@ object XiangShan extends CommonXiangShan {
override def rocketModule = rocketchip
override def difftestModule = difftest
override def huancunModule = huancun
override def coupledL2Module = coupledL2
override def fudianModule = fudian
override def utilityModule = utility
}
Subproject commit 5b65bc6d5f3d7bbbc2ae5f5726dfe2d257170a39
Subproject commit ea83bb7f84115ecfa0568f6697086f186827ea06
Subproject commit 41a2f27f21744351374e27724ce10a4c8354f400
Subproject commit d5b306ce44261e8a703ce6333fa6f4060d7f522c
Subproject commit b7308d958dfa7073e47ca10ce3974d267592049c
#!/bin/bash
mem_script=$1
conf_file=$2
output_dir=$3
IFS=$'\n'
for line in `cat $conf_file`; do
file=`echo "$line" | grep -oP '(?<=name )[^ ]*(?= .*)'`
echo $line >${conf_file}.tmp
${mem_script} ${conf_file}.tmp -o ${output_dir}/${file}.v
done
rm ${conf_file}.tmp
# top-down 分析工具
最新的 top-down 分析工具已经与 env-scripts 集成。在使用 `xs_autorun.py` 完成 checkpoint 的运行后,使用 `--report-top-down` 参数即可!
本仓库集成了 top-down 分析所需要的工具。
## 运行仿真
......
......@@ -31,14 +31,14 @@ tmp=$(grep "stall_loads_bound," $filename)
load_bound_cycles=${tmp##* }
tmp=$(grep "stall_ls_bandwidth_bound," $filename)
ls_dq_bound_cycles=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_rob," $filename)
stall_cycle_rob=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_int_dq," $filename)
stall_cycle_int_dq=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_fp_dq," $filename)
stall_cycle_fp_dq=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_ls_dq," $filename)
stall_cycle_ls_dq=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_rob_blame," $filename)
stall_cycle_rob_blame=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_int_blame," $filename)
stall_cycle_int_blame=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_fp_blame," $filename)
stall_cycle_fp_blame=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.dispatch: stall_cycle_ls_blame," $filename)
stall_cycle_ls_blame=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.rename: stall_cycle_fp," $filename)
stall_cycle_fp=${tmp##* }
tmp=$(grep "core_with_l2.core.ctrlBlock.rename: stall_cycle_int," $filename)
......
......@@ -107,7 +107,7 @@ def process_one(path, head):
csv_file['ifu2id_allNO_slots'] = use('ifu2id_allNO_cycle') * 6
csv_file['ifu2id_hvButNotFull_slots'] = use('fetch_bubbles') - use('ifu2id_allNO_slots')
stall_cycles_core = use('stall_cycle_fp') + use('stall_cycle_int') + use('stall_cycle_rob') + use('stall_cycle_int_dq') + use('stall_cycle_fp_dq') + use('ls_dq_bound_cycles')
stall_cycles_core = use('stall_cycle_fp') + use('stall_cycle_int') + use('stall_cycle_rob_blame') + use('stall_cycle_int_blame') + use('stall_cycle_fp_blame') + use('ls_dq_bound_cycles')
top = TopDown("Top", 1.0)
......@@ -146,9 +146,9 @@ def process_one(path, head):
loads_bound = memory_bound.add_down("Loads Bound", use('load_bound_cycles') / use('total_cycles'))
# top->backend_bound->core_bound
integer_dq = core_bound.add_down("Integer DQ", core_bound * use('stall_cycle_int_dq') / stall_cycles_core)
floatpoint_dq = core_bound.add_down("Floatpoint DQ", core_bound * use('stall_cycle_fp_dq') / stall_cycles_core)
rob = core_bound.add_down("ROB", core_bound * use('stall_cycle_rob') / stall_cycles_core)
integer_dq = core_bound.add_down("Integer DQ", core_bound * use('stall_cycle_int_blame') / stall_cycles_core)
floatpoint_dq = core_bound.add_down("Floatpoint DQ", core_bound * use('stall_cycle_fp_blame') / stall_cycles_core)
rob = core_bound.add_down("ROB", core_bound * use('stall_cycle_rob_blame') / stall_cycles_core)
integer_prf = core_bound.add_down("Integer PRF", core_bound * use('stall_cycle_int') / stall_cycles_core)
floatpoint_prf = core_bound.add_down("Floatpoint PRF", core_bound * use('stall_cycle_fp') / stall_cycles_core)
lsu_ports = core_bound.add_down("LSU Ports", core_bound * use('ls_dq_bound_cycles') / stall_cycles_core)
......
......@@ -87,20 +87,32 @@ func paramstr(chn, param) {
}
{
$1 = $NF; # timestamp
echo = $2;
user = $3;
data_1 = $4;
data_2 = $5;
data_3 = $6;
data_4 = $7;
sink = $9;
source = $10;
$1 = $14; # timestamp
$2 = $NF; # name
$3 = chnstr($13) # channel
$NF = ""; # remove log id
$5 = paramstr($3, $5) # param
$4 = opstr($3, $4) # opcode
$3 = chnstr($3) # channel
for(i=8; i<=12; i++){
if(i == 8){ # col 8 is address
$i = sprintf("%lx", $i);
} else { # cols 9-12 are data
$i = sprintf("%016lx", $i);
}
}
$13 = sprintf("user: %lx", $13);
$14 = sprintf("echo: %lx", $14);
$6 = sink;
$7 = source;
$5 = paramstr($13, $11) # param
$4 = opstr($13, $12) # opcode
$8 = sprintf("%lx", $8) # address
$9 = sprintf("%016lx", data_1)
$10 = sprintf("%016lx", data_2)
$11 = sprintf("%016lx", data_3)
$12 = sprintf("%016lx", data_4)
$13 = sprintf("user: %lx", user);
$14 = sprintf("echo: %lx", echo);
}
1 # print every line
......
......@@ -88,6 +88,7 @@ class XSArgs(object):
self.diff = self.diff.replace("nemu-interpreter", "spike")
self.fork = not args.disable_fork
self.disable_diff = args.no_diff
self.disable_db = args.no_db
# wave dump path
if args.wave_dump is not None:
self.set_wave_home(args.wave_dump)
......@@ -246,7 +247,8 @@ class XiangShan(object):
numa_args = f"numactl -m {numa_info[0]} -C {numa_info[1]}-{numa_info[2]}"
fork_args = "--enable-fork" if self.args.fork else ""
diff_args = "--no-diff" if self.args.disable_diff else ""
return_code = self.__exec_cmd(f'{numa_args} $NOOP_HOME/build/emu -i {workload} {emu_args} {fork_args} {diff_args}')
chiseldb_args = "--dump-db" if not self.args.disable_db else ""
return_code = self.__exec_cmd(f'{numa_args} $NOOP_HOME/build/emu -i {workload} {emu_args} {fork_args} {diff_args} {chiseldb_args}')
return return_code
def run_simv(self, workload):
......@@ -414,6 +416,7 @@ class XiangShan(object):
self.__exec_cmd(f"cp $NOOP_HOME/build/*.vcd $WAVE_HOME")
self.__exec_cmd(f"cp $NOOP_HOME/build/emu $WAVE_HOME")
self.__exec_cmd(f"cp $NOOP_HOME/build/SimTop.v $WAVE_HOME")
self.__exec_cmd(f"cp $NOOP_HOME/build/*.db $WAVE_HOME")
return ret
return 0
......@@ -436,6 +439,7 @@ class XiangShan(object):
self.__exec_cmd(f"cp $NOOP_HOME/build/*.vcd $WAVE_HOME")
self.__exec_cmd(f"cp $NOOP_HOME/build/emu $WAVE_HOME")
self.__exec_cmd(f"cp $NOOP_HOME/build/SimTop.v $WAVE_HOME")
self.__exec_cmd(f"cp $NOOP_HOME/build/*.db $WAVE_HOME")
return ret
return 0
......@@ -488,6 +492,7 @@ if __name__ == "__main__":
parser.add_argument('--disable-fork', action='store_true', help='disable lightSSS')
parser.add_argument('--no-diff', action='store_true', help='disable difftest')
parser.add_argument('--ram-size', nargs='?', type=str, help='manually set simulation memory size (8GB by default)')
parser.add_argument('--no-db', action='store_true', help='disable chiseldb dump')
args = parser.parse_args()
......
......@@ -24,14 +24,13 @@ import freechips.rocketchip.devices.tilelink.{CLINT, CLINTParams, DevNullParams,
import freechips.rocketchip.diplomacy.{AddressSet, IdRange, InModuleBody, LazyModule, LazyModuleImp, MemoryDevice, RegionType, SimpleDevice, TransferSizes}
import freechips.rocketchip.interrupts.{IntSourceNode, IntSourcePortSimple}
import freechips.rocketchip.regmapper.{RegField, RegFieldAccessType, RegFieldDesc, RegFieldGroup}
import utility.{BinaryArbiter, TLEdgeBuffer}
import utility.{BinaryArbiter, TLClientsMerger, TLEdgeBuffer, TLLogger}
import xiangshan.{DebugOptionsKey, HasXSParameter, XSBundle, XSCore, XSCoreParameters, XSTileKey}
import freechips.rocketchip.amba.axi4._
import freechips.rocketchip.tilelink._
import top.BusPerfMonitor
import xiangshan.backend.fu.PMAConst
import huancun._
import huancun.debug.TLLogger
case object SoCParamsKey extends Field[SoCParameters]
......@@ -42,7 +41,7 @@ case class SoCParameters
extIntrs: Int = 64,
L3NBanks: Int = 4,
L3CacheParamsOpt: Option[HCCacheParameters] = Some(HCCacheParameters(
name = "l3",
name = "L3",
level = 3,
ways = 8,
sets = 2048 // 1MB per bank
......@@ -148,10 +147,13 @@ trait HaveAXI4MemPort {
))
val mem_xbar = TLXbar()
val l3_mem_pmu = BusPerfMonitor(name = "L3_Mem", enable = !debugOpts.FPGAPlatform, stat_latency = true, add_reqkey = true)
mem_xbar :=*
TLBuffer.chainNode(2) :=
TLCacheCork() :=
l3_mem_pmu :=
TLClientsMerger() :=
TLXbar() :=*
TLBuffer.chainNode(2) :=*
TLCacheCork() :=*
bankedNode
mem_xbar :=
......@@ -232,10 +234,9 @@ class SoCMisc()(implicit p: Parameters) extends BaseSoC
val l3_in = TLTempNode()
val l3_out = TLTempNode()
val l3_mem_pmu = BusPerfMonitor(enable = !debugOpts.FPGAPlatform)
l3_in :*= TLEdgeBuffer(_ => true, Some("L3_in_buffer")) :*= l3_banked_xbar
bankedNode :*= TLLogger("MEM_L3", !debugOpts.FPGAPlatform) :*= l3_mem_pmu :*= l3_out
bankedNode :*= TLLogger("MEM_L3", !debugOpts.FPGAPlatform && debugOpts.AlwaysBasicDB) :*= l3_out
if(soc.L3CacheParamsOpt.isEmpty){
l3_out :*= l3_in
......@@ -247,7 +248,7 @@ class SoCMisc()(implicit p: Parameters) extends BaseSoC
for ((core_out, i) <- core_to_l3_ports.zipWithIndex){
l3_banked_xbar :=*
TLLogger(s"L3_L2_$i", !debugOpts.FPGAPlatform) :=*
TLLogger(s"L3_L2_$i", !debugOpts.FPGAPlatform && debugOpts.AlwaysBasicDB) :=*
TLBuffer() :=
core_out
}
......
......@@ -47,10 +47,11 @@ object ArgParser {
val c = Class.forName(prefix + confString).getConstructor(Integer.TYPE)
c.newInstance(1.asInstanceOf[Object]).asInstanceOf[Parameters]
}
def parse(args: Array[String]): (Parameters, Array[String], FirrtlCompiler) = {
def parse(args: Array[String]): (Parameters, Array[String], FirrtlCompiler, Array[String]) = {
val default = new DefaultConfig(1)
var firrtlOpts = Array[String]()
var firrtlCompiler: FirrtlCompiler = SFC
var firtoolOpts = Array[String]()
@tailrec
def nextOption(config: Parameters, list: List[String]): Parameters = {
list match {
......@@ -71,6 +72,10 @@ object ArgParser {
nextOption(config.alter((site, here, up) => {
case DebugOptionsKey => up(DebugOptionsKey).copy(UseDRAMSim = true)
}), tail)
case "--with-constantin" :: tail =>
nextOption(config.alter((site, here, up) => {
case DebugOptionsKey => up(DebugOptionsKey).copy(EnableConstantin = true)
}), tail)
case "--fpga-platform" :: tail =>
nextOption(config.alter((site, here, up) => {
case DebugOptionsKey => up(DebugOptionsKey).copy(FPGAPlatform = true)
......@@ -87,13 +92,12 @@ object ArgParser {
nextOption(config.alter((site, here, up) => {
case DebugOptionsKey => up(DebugOptionsKey).copy(EnablePerfDebug = false)
}), tail)
case "--enable-topdown" :: tail =>
nextOption(config.alter((site, here, up) => {
case DebugOptionsKey => up(DebugOptionsKey).copy(EnableTopDown = true)
}), tail)
case "--mfc" :: tail =>
firrtlCompiler = MFC
nextOption(config, tail)
case "--firtool-opt" :: option :: tail =>
firtoolOpts :+= option
nextOption(config, tail)
case option :: tail =>
// unknown option, maybe a firrtl option, skip
firrtlOpts :+= option
......@@ -101,6 +105,6 @@ object ArgParser {
}
}
var config = nextOption(default, args.toList)
(config, firrtlOpts, firrtlCompiler)
(config, firrtlOpts, firrtlCompiler, firtoolOpts)
}
}
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package top
import chipsalliance.rocketchip.config.Parameters
......@@ -6,13 +22,28 @@ import freechips.rocketchip.tilelink._
import chisel3._
import chisel3.util._
import utils.{XSPerfAccumulate, XSPerfPrint}
import freechips.rocketchip.tilelink.TLMessages._
import freechips.rocketchip.tilelink.TLPermissions._
import utility.{ReqSourceField, ReqSourceKey, GTimer}
import xiangshan.MemReqSource
class BusPerfMonitor()(implicit p: Parameters) extends LazyModule {
val node = TLAdapterNode()
lazy val module = new BusPerfMonitorImp(this)
class BusPerfMonitor(name: String, stat_latency: Boolean, add_reqkey: Boolean)(implicit p: Parameters) extends LazyModule {
val node = if (add_reqkey) TLAdapterNode(managerFn = { m =>
TLSlavePortParameters.v1(
m.managers.map { m =>
m.v2copy()
},
requestKeys = Seq(ReqSourceKey),
beatBytes = 32,
endSinkId = m.endSinkId
)
}) else {
TLAdapterNode()
}
lazy val module = new BusPerfMonitorImp(this, name, stat_latency)
}
class BusPerfMonitorImp(outer: BusPerfMonitor)
class BusPerfMonitorImp(outer: BusPerfMonitor, name: String, stat_latency: Boolean)
extends LazyModuleImp(outer)
{
......@@ -24,7 +55,7 @@ class BusPerfMonitorImp(outer: BusPerfMonitor)
def PERF_CHN[T <: TLChannel](clientName: String, chn: DecoupledIO[T]) = {
val channelName = chn.bits.channelName.replaceAll(" ", "_").replaceAll("'", "")
XSPerfAccumulate(s"${clientName}_${channelName}_fire", chn.fire())
XSPerfAccumulate(s"${clientName}_${channelName}_fire", chn.fire)
XSPerfAccumulate(s"${clientName}_${channelName}_stall", chn.valid && !chn.ready)
val ops = chn.bits match {
......@@ -40,28 +71,28 @@ class BusPerfMonitorImp(outer: BusPerfMonitor)
chn.bits match {
case a: TLBundleA =>
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_fire",
i.U === a.opcode && chn.fire()
i.U === a.opcode && chn.fire
)
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_stall",
i.U === a.opcode && chn.valid && !chn.ready
)
case b: TLBundleB =>
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_fire",
i.U === b.opcode && chn.fire()
i.U === b.opcode && chn.fire
)
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_stall",
i.U === b.opcode && chn.valid && !chn.ready
)
case c: TLBundleC =>
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_fire",
i.U === c.opcode && chn.fire()
i.U === c.opcode && chn.fire
)
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_stall",
i.U === c.opcode && chn.valid && !chn.ready
)
case d: TLBundleD =>
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_fire",
i.U === d.opcode && chn.fire()
i.U === d.opcode && chn.fire
)
XSPerfAccumulate(s"${clientName}_${channelName}_${op}_stall",
i.U === d.opcode && chn.valid && !chn.ready
......@@ -70,22 +101,86 @@ class BusPerfMonitorImp(outer: BusPerfMonitor)
}
}
for(((in, edgeIn), i) <- outer.node.in.zipWithIndex) {
val clientName = s"${edgeIn.master.masters.head.name}_bank_$i"
for (((in, edgeIn), i) <- outer.node.in.zipWithIndex) {
val clientName = s"${name}_${edgeIn.master.masters.head.name}_bank_$i"
PERF_CHN(clientName, in.a)
PERF_CHN(clientName, in.d)
if(in.params.hasBCE){
if (in.params.hasBCE) {
PERF_CHN(clientName, in.b)
PERF_CHN(clientName, in.c)
PERF_CHN(clientName, in.e)
}
}
if (stat_latency) {
val nrEdge = outer.node.in.length.toInt
val edgeIn = outer.node.in.head._2
class RecordEntry()(implicit p: Parameters) extends Bundle {
val valid = Bool()
val timeStamp = UInt(64.W)
val reqType = UInt(8.W)
}
// For simplicity, latency statistic works between nodes with SINGLE edge
require(nrEdge == 1)
val timer = GTimer()
val nrSource = math.pow(2, edgeIn.bundle.sourceBits).toInt
val latencyRecord = RegInit(VecInit(Seq.fill(nrSource)(0.U.asTypeOf(new RecordEntry()))))
val latencySum = RegInit(0.U(128.W))
val nrRecord = RegInit(0.U(128.W))
outer.node.in.zip(outer.node.out).zipWithIndex.foreach {
case (((in, edgeIn), (out, edgeOut)), i) =>
val channelA = in.a
when(channelA.fire &&
channelA.bits.opcode =/= Hint &&
channelA.bits.opcode =/= PutFullData &&
channelA.bits.opcode =/= PutPartialData
) {
// Valid channel A fire, record it
assert(latencyRecord(channelA.bits.source).valid === false.B)
latencyRecord(channelA.bits.source).valid := true.B
latencyRecord(channelA.bits.source).timeStamp := timer
latencyRecord(channelA.bits.source).reqType := channelA.bits.user.lift(ReqSourceKey).getOrElse(MemReqSource.NoWhere.id.U)
}
val channelD = in.d
val (first, _, _, _) = edgeIn.count(channelD)
// Valid channel D fire, resolve it
val resolveRecord = channelD.fire && first &&
channelD.bits.opcode =/= ReleaseAck &&
channelD.bits.opcode =/= AccessAck
val latency = WireInit(0.U(64.W))
when(resolveRecord) {
assert(latencyRecord(channelD.bits.source).valid === true.B)
latencyRecord(channelD.bits.source).valid := false.B
latency := timer - latencyRecord(channelD.bits.source).timeStamp
latencySum := latencySum + timer
nrRecord := nrRecord + 1.U
// printf("timer: %x\n", latency)
}
XSPerfAccumulate(name + "_nrRecord_all", resolveRecord)
XSPerfAccumulate(name + "_latencySum_all", Mux(resolveRecord, latency, 0.U))
for (j <- 0 until MemReqSource.ReqSourceCount.id) {
val typeMatch = latencyRecord(channelD.bits.source).reqType === j.U
XSPerfAccumulate(name + s"_nrRecord_type${j}", resolveRecord && typeMatch)
XSPerfAccumulate(name + s"_latencySum_type${j}", Mux(resolveRecord && typeMatch, latency, 0.U))
}
}
}
}
object BusPerfMonitor {
def apply(enable: Boolean = false)(implicit p: Parameters) = {
def apply(
name: String,
enable: Boolean = false,
stat_latency: Boolean = false,
add_reqkey: Boolean = false)(implicit p: Parameters) =
{
if(enable){
val busPMU = LazyModule(new BusPerfMonitor())
val busPMU = LazyModule(new BusPerfMonitor(name, stat_latency, add_reqkey))
busPMU.node
} else {
TLTempNode()
......
......@@ -33,6 +33,7 @@ import xiangshan.cache.DCacheParameters
import xiangshan.cache.mmu.{L2TLBParameters, TLBParameters}
import device.{EnableJtag, XSDebugModuleParams}
import huancun._
import coupledL2._
class BaseConfig(n: Int) extends Config((site, here, up) => {
case XLen => 64
......@@ -62,10 +63,16 @@ class MinimalConfig(n: Int = 1) extends Config(
FetchWidth = 4,
IssQueSize = 8,
NRPhyRegs = 64,
LoadQueueSize = 16,
LoadQueueNWriteBanks = 4,
VirtualLoadQueueSize = 16,
LoadQueueRARSize = 16,
LoadQueueRAWSize = 12,
LoadQueueReplaySize = 8,
LoadUncacheBufferSize = 8,
LoadQueueNWriteBanks = 4, // NOTE: make sure that LoadQueue{RAR, RAW, Replay}Size is divided by LoadQueueNWriteBanks.
RollbackGroupSize = 8,
StoreQueueSize = 12,
StoreQueueNWriteBanks = 4,
StoreQueueNWriteBanks = 4, // NOTE: make sure that StoreQueueSize is divided by StoreQueueNWriteBanks
StoreQueueForwardWithMask = true,
RobSize = 32,
FtqSize = 8,
IBufSize = 16,
......@@ -99,7 +106,8 @@ class MinimalConfig(n: Int = 1) extends Config(
nReleaseEntries = 1,
nProbeEntries = 2,
nPrefetchEntries = 2,
hasPrefetch = false
nPrefBufferEntries = 32,
hasPrefetch = true
),
dcacheParametersOpt = Some(DCacheParameters(
nSets = 64, // 32KB DCache
......@@ -173,7 +181,14 @@ class MinimalConfig(n: Int = 1) extends Config(
l3nWays = 8,
spSize = 2,
),
L2CacheParamsOpt = None, // remove L2 Cache
L2CacheParamsOpt = Some(L2Param(
name = "L2",
ways = 8,
sets = 128,
echoField = Seq(huancun.DirtyField()),
prefetch = None
)),
L2NBanks = 2,
prefetcher = None // if L2 pf_recv_node does not exist, disable SMS prefetcher
)
)
......@@ -183,14 +198,12 @@ class MinimalConfig(n: Int = 1) extends Config(
L3CacheParamsOpt = Some(up(SoCParamsKey).L3CacheParamsOpt.get.copy(
sets = 1024,
inclusive = false,
clientCaches = tiles.map{ p =>
CacheParameters(
"dcache",
sets = 2 * p.dcacheParametersOpt.get.nSets,
ways = p.dcacheParametersOpt.get.nWays + 2,
blockGranularity = log2Ceil(2 * p.dcacheParametersOpt.get.nSets),
aliasBitsOpt = None
)
clientCaches = tiles.map{ core =>
val clientDirBytes = tiles.map{ t =>
t.L2NBanks * t.L2CacheParamsOpt.map(_.toCacheParams.capacity).getOrElse(0)
}.sum
val l2params = core.L2CacheParamsOpt.get.toCacheParams
l2params.copy(sets = 2 * clientDirBytes / core.L2NBanks / l2params.ways / 64)
},
simulation = !site(DebugOptionsKey).FPGAPlatform
)),
......@@ -234,35 +247,25 @@ class WithNKBL2
n: Int,
ways: Int = 8,
inclusive: Boolean = true,
banks: Int = 1,
alwaysReleaseData: Boolean = false
banks: Int = 1
) extends Config((site, here, up) => {
case XSTileKey =>
val upParams = up(XSTileKey)
val l2sets = n * 1024 / banks / ways / 64
upParams.map(p => p.copy(
L2CacheParamsOpt = Some(HCCacheParameters(
L2CacheParamsOpt = Some(L2Param(
name = "L2",
level = 2,
ways = ways,
sets = l2sets,
inclusive = inclusive,
alwaysReleaseData = alwaysReleaseData,
clientCaches = Seq(CacheParameters(
clientCaches = Seq(L1Param(
"dcache",
sets = 2 * p.dcacheParametersOpt.get.nSets / banks,
ways = p.dcacheParametersOpt.get.nWays + 2,
blockGranularity = log2Ceil(2 * p.dcacheParametersOpt.get.nSets / banks),
aliasBitsOpt = p.dcacheParametersOpt.get.aliasBitsOpt
)),
reqField = Seq(PreferCacheField()),
echoField = Seq(DirtyField()),
prefetch = Some(huancun.prefetch.PrefetchReceiverParams()),
enablePerf = true,
sramDepthDiv = 2,
tagECC = Some("secded"),
dataECC = Some("secded"),
simulation = !site(DebugOptionsKey).FPGAPlatform
reqField = Seq(utility.ReqSourceField()),
echoField = Seq(huancun.DirtyField()),
prefetch = Some(coupledL2.prefetch.PrefetchReceiverParams())
)),
L2NBanks = banks
))
......@@ -292,6 +295,7 @@ class WithNKBL3(n: Int, ways: Int = 8, inclusive: Boolean = true, banks: Int = 1
address = 0x39000000,
numCores = tiles.size
)),
reqField = Seq(utility.ReqSourceField()),
sramClkDivBy2 = true,
sramDepthDiv = 4,
tagECC = Some("secded"),
......@@ -315,21 +319,21 @@ class DefaultL3DebugConfig(n: Int = 1) extends Config(
class MinimalAliasDebugConfig(n: Int = 1) extends Config(
new WithNKBL3(512, inclusive = false) ++
new WithNKBL2(256, inclusive = false, alwaysReleaseData = true) ++
new WithNKBL2(256, inclusive = false) ++
new WithNKBL1D(128) ++
new MinimalConfig(n)
)
class MediumConfig(n: Int = 1) extends Config(
new WithNKBL3(4096, inclusive = false, banks = 4)
++ new WithNKBL2(512, inclusive = false, alwaysReleaseData = true)
++ new WithNKBL2(512, inclusive = false)
++ new WithNKBL1D(128)
++ new BaseConfig(n)
)
class DefaultConfig(n: Int = 1) extends Config(
new WithNKBL3(6 * 1024, inclusive = false, banks = 4, ways = 6)
++ new WithNKBL2(2 * 512, inclusive = false, banks = 4, alwaysReleaseData = true)
++ new WithNKBL2(2 * 512, inclusive = false, banks = 4)
++ new WithNKBL1D(128)
++ new BaseConfig(n)
)
......@@ -44,7 +44,7 @@ case object MFC extends FirrtlCompiler
object Generator {
def execute(args: Array[String], mod: => RawModule, fc: FirrtlCompiler) = {
def execute(args: Array[String], mod: => RawModule, fc: FirrtlCompiler, firtoolOpts: Array[String]) = {
fc match {
case MFC =>
val sfcXsTransforms = Seq(
......@@ -69,9 +69,8 @@ object Generator {
})
(new circt.stage.ChiselStage).execute(mfcArgs, Seq(
ChiselGeneratorAnnotation(mod _),
circt.stage.CIRCTTargetAnnotation(circt.stage.CIRCTTarget.Verilog),
circt.stage.CIRCTHandover(circt.stage.CIRCTHandover.CHIRRTL)
))
circt.stage.CIRCTTargetAnnotation(circt.stage.CIRCTTarget.Verilog)
) ++ firtoolOpts.map(opt => circt.stage.FirtoolOption(opt)))
case SFC =>
(new XiangShanStage).execute(args, Seq(
ChiselGeneratorAnnotation(mod _),
......
......@@ -69,7 +69,7 @@ class XSTop()(implicit p: Parameters) extends BaseXSSoc() with HasSoCParameter
val l3cacheOpt = soc.L3CacheParamsOpt.map(l3param =>
LazyModule(new HuanCun()(new Config((_, _, _) => {
case HCCacheParamsKey => l3param.copy(enableTopDown = debugOpts.EnableTopDown)
case HCCacheParamsKey => l3param.copy(hartIds = tiles.map(_.HartId))
})))
)
......@@ -101,6 +101,8 @@ class XSTop()(implicit p: Parameters) extends BaseXSSoc() with HasSoCParameter
case Some(l3) =>
misc.l3_out :*= l3.node :*= TLBuffer.chainNode(2) :*= misc.l3_banked_xbar
case None =>
val dummyMatch = WireDefault(false.B)
tiles.map(_.HartId).foreach(hartId => ExcitingUtils.addSource(dummyMatch, s"L3MissMatch_${hartId}", ExcitingUtils.Perf, true))
}
lazy val module = new LazyRawModuleImp(this) {
......@@ -202,9 +204,17 @@ class XSTop()(implicit p: Parameters) extends BaseXSSoc() with HasSoCParameter
object TopMain extends App with HasRocketChipStageUtils {
override def main(args: Array[String]): Unit = {
val (config, firrtlOpts, firrtlComplier) = ArgParser.parse(args)
val (config, firrtlOpts, firrtlComplier, firtoolOpts) = ArgParser.parse(args)
// tools: init to close dpi-c when in fpga
val envInFPGA = config(DebugOptionsKey).FPGAPlatform
val enableChiselDB = config(DebugOptionsKey).EnableChiselDB
val enableConstantin = config(DebugOptionsKey).EnableConstantin
Constantin.init(enableConstantin && !envInFPGA)
ChiselDB.init(enableChiselDB && !envInFPGA)
val soc = DisableMonitors(p => LazyModule(new XSTop()(p)))(config)
Generator.execute(firrtlOpts, soc.module, firrtlComplier)
Generator.execute(firrtlOpts, soc.module, firrtlComplier, firtoolOpts)
FileRegisters.write(fileDir = "./build", filePrefix = "XSTop.")
}
}
......@@ -47,7 +47,7 @@ object XSLog {
if (!debugOpts.FPGAPlatform && (enableDebug || enablePerf || debugLevel == XSLogLevel.ERROR)) {
ExcitingUtils.addSink(logEnable, "DISPLAY_LOG_ENABLE")
ExcitingUtils.addSink(logTimestamp, "logTimestamp")
val check_cond = (if (debugLevel == XSLogLevel.ERROR) true.B else logEnable) && cond
val check_cond = (if (debugLevel == XSLogLevel.ERROR) true.B else logEnable) && cond && RegNext(true.B, false.B)
when (check_cond) {
val commonInfo = p"[$debugLevel][time=$logTimestamp] $MagicStr: "
printf((if (prefix) commonInfo else p"") + pable)
......
......@@ -31,6 +31,7 @@ import xiangshan.frontend.FtqPtr
import xiangshan.frontend.CGHPtr
import xiangshan.frontend.FtqRead
import xiangshan.frontend.FtqToCtrlIO
import xiangshan.cache.HasDCacheParameters
import utils._
import utility._
......@@ -57,14 +58,15 @@ object ValidUndirectioned {
}
object RSFeedbackType {
val tlbMiss = 0.U(3.W)
val mshrFull = 1.U(3.W)
val dataInvalid = 2.U(3.W)
val bankConflict = 3.U(3.W)
val ldVioCheckRedo = 4.U(3.W)
val lrqFull = 0.U(3.W)
val tlbMiss = 1.U(3.W)
val mshrFull = 2.U(3.W)
val dataInvalid = 3.U(3.W)
val bankConflict = 4.U(3.W)
val ldVioCheckRedo = 5.U(3.W)
val feedbackInvalid = 7.U(3.W)
val allTypes = 8
def apply() = UInt(3.W)
}
......@@ -89,7 +91,9 @@ class CfiUpdateInfo(implicit p: Parameters) extends XSBundle with HasBPUParamete
val histPtr = new CGHPtr
val specCnt = Vec(numBr, UInt(10.W))
// need pipeline update
val br_hit = Bool()
val br_hit = Bool() // if in ftb entry
val jr_hit = Bool() // if in ftb entry
val sc_hit = Bool() // if used in ftb entry, invalid if !br_hit
val predTaken = Bool()
val target = UInt(VAddrBits.W)
val taken = Bool()
......@@ -299,6 +303,8 @@ class Redirect(implicit p: Parameters) extends XSBundle {
val stFtqOffset = UInt(log2Up(PredictWidth).W)
val debug_runahead_checkpoint_id = UInt(64.W)
val debugIsCtrl = Bool()
val debugIsMemVio = Bool()
// def isUnconditional() = RedirectLevel.isUnconditional(level)
def flushItself() = RedirectLevel.flushItself(level)
......@@ -411,6 +417,7 @@ class MemRSFeedbackIO(implicit p: Parameters) extends XSBundle {
class FrontendToCtrlIO(implicit p: Parameters) extends XSBundle {
// to backend end
val cfVec = Vec(DecodeWidth, DecoupledIO(new CtrlFlow))
val stallReason = new StallReasonIO(DecodeWidth)
val fromFtq = new FtqToCtrlIO
// from backend
val toFtq = Flipped(new CtrlToFtqIO)
......@@ -659,3 +666,13 @@ class MatchTriggerIO(implicit p: Parameters) extends XSBundle {
val chain = Output(Bool())
val tdata2 = Output(UInt(64.W))
}
// Per-slot decode/rename stall-reason reporting channel.
// `width` is the number of pipeline slots covered (DecodeWidth at the use site in FrontendToCtrlIO).
// NOTE: field order in a Chisel Bundle fixes the hardware bit layout — do not reorder.
class StallReasonIO(width: Int) extends Bundle {
// One stall-reason code per slot, encoded as an index into TopDownCounters.NumStallReasons.
val reason = Output(Vec(width, UInt(log2Ceil(TopDownCounters.NumStallReasons.id).W)))
// Flipped: a single reason driven back from the downstream (backend) side, valid-qualified.
val backReason = Flipped(Valid(UInt(log2Ceil(TopDownCounters.NumStallReasons.id).W)))
}
// Custom L2 -> L1 hint interface: lets the L2 cache give the core early notice
// that a miss response is coming, identified by its MSHR slot.
class L2ToL1Hint(implicit p: Parameters) extends XSBundle with HasDCacheParameters {
// TileLink source ID mapped onto a DCache MSHR id; width sized by cfg.nMissEntries.
val sourceId = UInt(log2Up(cfg.nMissEntries).W) // tilelink sourceID -> mshr id
}
package xiangshan
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util.log2Ceil
import xiangshan.backend.rob.{DebugLsInfo, DebugMdpInfo}
import xiangshan.cache.DCacheBundle
/** Mem */
// Debug/trace record for a single load that missed in the DCache.
class LoadMissEntry(implicit p: Parameters) extends DCacheBundle {
// Timestamp (cycle count) when the entry was captured.
val timeCnt = UInt(XLEN.W)
// ROB index of the load instruction, for correlating with commit traces.
val robIdx = UInt(log2Ceil(RobSize).W)
// Physical address of the access.
val paddr = UInt(PAddrBits.W)
// Virtual address of the access.
val vaddr = UInt(VAddrBits.W)
// Miss progression state.
// 1:first hit, 2:first miss, 3:second miss
val missState = UInt(3.W)
}
// Per-instruction debug/profiling record: identity, exception info, load/store
// addresses, and latency breakdowns through the pipeline.
class InstInfoEntry(implicit p: Parameters) extends XSBundle{
// Globally unique instruction id for trace correlation.
val globalID = UInt(XLEN.W)
// ROB index of the instruction.
val robIdx = UInt(log2Ceil(RobSize).W)
// Functional-unit type (FuType encoding).
val instType = FuType()
// Exception vector raised by this instruction, if any.
val exceptType = UInt(ExceptionVec.ExceptionVecSize.W)
// Virtual PC of the instruction.
val ivaddr = UInt(VAddrBits.W)
val dvaddr = UInt(VAddrBits.W) // the load/store access virtual address
val dpaddr = UInt(VAddrBits.W) // physical address, meaningful only once the TLB lookup is valid
val tlbLatency = UInt(XLEN.W)  // cycles spent in address translation (originally: L1-to-L2 TLB latency)
val accessLatency = UInt(XLEN.W)  // RS issue time --> write back time
// Execution latency of the functional unit.
val executeLatency = UInt(XLEN.W)
// Latency from dispatch to issue.
val issueLatency = UInt(XLEN.W)
// Load/store debug details (see DebugLsInfo).
val lsInfo = new DebugLsInfo
// Memory-dependence-prediction debug details (see DebugMdpInfo).
val mdpInfo = new DebugMdpInfo
// Absolute issue timestamp.
val issueTime = UInt(XLEN.W)
// Absolute writeback timestamp.
val writebackTime = UInt(XLEN.W)
}
\ No newline at end of file
......@@ -30,6 +30,7 @@ import freechips.rocketchip.diplomacy.AddressSet
import system.SoCParamsKey
import huancun._
import huancun.debug._
import coupledL2._
import xiangshan.mem.prefetch.{PrefetcherParams, SMSParams}
import scala.math.min
......@@ -66,6 +67,7 @@ case class XSCoreParameters
UbtbGHRLength: Int = 4,
// HistoryLength: Int = 512,
EnableGHistDiff: Boolean = true,
EnableCommitGHistDiff: Boolean = true,
UbtbSize: Int = 256,
FtbSize: Int = 2048,
RasSize: Int = 32,
......@@ -129,10 +131,16 @@ case class XSCoreParameters
EnableLoadFastWakeUp: Boolean = true, // NOTE: not supported now, make it false
IssQueSize: Int = 16,
NRPhyRegs: Int = 192,
LoadQueueSize: Int = 80,
LoadQueueNWriteBanks: Int = 8,
VirtualLoadQueueSize: Int = 80,
LoadQueueRARSize: Int = 80,
LoadQueueRAWSize: Int = 64, // NOTE: make sure that LoadQueueRAWSize is power of 2.
RollbackGroupSize: Int = 8,
LoadQueueReplaySize: Int = 80,
LoadUncacheBufferSize: Int = 20,
LoadQueueNWriteBanks: Int = 8, // NOTE: make sure that LoadQueueRARSize/LoadQueueRAWSize is divided by LoadQueueNWriteBanks
StoreQueueSize: Int = 64,
StoreQueueNWriteBanks: Int = 8,
StoreQueueNWriteBanks: Int = 8, // NOTE: make sure that StoreQueueSize is divided by StoreQueueNWriteBanks
StoreQueueForwardWithMask: Boolean = true,
VlsQueueSize: Int = 8,
RobSize: Int = 256,
dpParams: DispatchParameters = DispatchParameters(
......@@ -183,6 +191,8 @@ case class XSCoreParameters
superNWays = 4,
superReplacer = Some("plru")
),
itlbPortNum: Int = 2 + ICacheParameters().prefetchPipeNum + 1,
ipmpPortNum: Int = 2 + ICacheParameters().prefetchPipeNum + 1,
ldtlbParameters: TLBParameters = TLBParameters(
name = "ldtlb",
normalNSets = 64,
......@@ -237,7 +247,8 @@ case class XSCoreParameters
replacer = Some("setplru"),
nMissEntries = 2,
nProbeEntries = 2,
nPrefetchEntries = 2,
nPrefetchEntries = 12,
nPrefBufferEntries = 64,
hasPrefetch = true,
),
dcacheParametersOpt: Option[DCacheParameters] = Some(DCacheParameters(
......@@ -248,12 +259,11 @@ case class XSCoreParameters
nProbeEntries = 8,
nReleaseEntries = 18
)),
L2CacheParamsOpt: Option[HCCacheParameters] = Some(HCCacheParameters(
L2CacheParamsOpt: Option[L2Param] = Some(L2Param(
name = "l2",
level = 2,
ways = 8,
sets = 1024, // default 512KB L2
prefetch = Some(huancun.prefetch.PrefetchReceiverParams())
prefetch = Some(coupledL2.prefetch.PrefetchReceiverParams())
)),
L2NBanks: Int = 1,
usePTWRepeater: Boolean = false,
......@@ -287,7 +297,9 @@ case class DebugOptions
EnableDebug: Boolean = false,
EnablePerfDebug: Boolean = true,
UseDRAMSim: Boolean = false,
EnableTopDown: Boolean = false
EnableConstantin: Boolean = false,
EnableChiselDB: Boolean = false,
AlwaysBasicDB: Boolean = true,
)
trait HasXSParameter {
......@@ -329,6 +341,7 @@ trait HasXSParameter {
val EnbaleTlbDebug = coreParams.EnbaleTlbDebug
val HistoryLength = coreParams.HistoryLength
val EnableGHistDiff = coreParams.EnableGHistDiff
val EnableCommitGHistDiff = coreParams.EnableCommitGHistDiff
val UbtbGHRLength = coreParams.UbtbGHRLength
val UbtbSize = coreParams.UbtbSize
val EnableFauFTB = coreParams.EnableFauFTB
......@@ -390,10 +403,16 @@ trait HasXSParameter {
val PhyRegIdxWidth = log2Up(NRPhyRegs)
val RobSize = coreParams.RobSize
val IntRefCounterWidth = log2Ceil(RobSize)
val LoadQueueSize = coreParams.LoadQueueSize
val VirtualLoadQueueSize = coreParams.VirtualLoadQueueSize
val LoadQueueRARSize = coreParams.LoadQueueRARSize
val LoadQueueRAWSize = coreParams.LoadQueueRAWSize
val RollbackGroupSize = coreParams.RollbackGroupSize
val LoadQueueReplaySize = coreParams.LoadQueueReplaySize
val LoadUncacheBufferSize = coreParams.LoadUncacheBufferSize
val LoadQueueNWriteBanks = coreParams.LoadQueueNWriteBanks
val StoreQueueSize = coreParams.StoreQueueSize
val StoreQueueNWriteBanks = coreParams.StoreQueueNWriteBanks
val StoreQueueForwardWithMask = coreParams.StoreQueueForwardWithMask
val VlsQueueSize = coreParams.VlsQueueSize
val dpParams = coreParams.dpParams
val exuParameters = coreParams.exuParameters
......@@ -463,7 +482,7 @@ trait HasXSParameter {
val SSIDWidth = log2Up(LFSTSize)
val LFSTWidth = 4
val StoreSetEnable = true // LWT will be disabled if SS is enabled
val LFSTEnable = false
val loadExuConfigs = coreParams.loadExuConfigs
val storeExuConfigs = coreParams.storeExuConfigs
......
......@@ -248,6 +248,7 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
val l2_pf_enable = Output(Bool())
val perfEvents = Input(Vec(numPCntHc * coreParams.L2NBanks, new PerfEvent))
val beu_errors = Output(new XSL1BusErrors())
val l2Hint = Input(Valid(new L2ToL1Hint()))
})
println(s"FPGAPlatform:${env.FPGAPlatform} EnableDebug:${env.EnableDebug}")
......@@ -302,7 +303,7 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
ctrlBlock.io.memoryViolation <> memBlock.io.memoryViolation
exuBlocks.head.io.scheExtra.enqLsq.get <> memBlock.io.enqLsq
exuBlocks.foreach(b => {
b.io.scheExtra.lcommit := ctrlBlock.io.robio.lsq.lcommit
b.io.scheExtra.lcommit := memBlock.io.lqDeq
b.io.scheExtra.scommit := memBlock.io.sqDeq
b.io.scheExtra.lqCancelCnt := memBlock.io.lqCancelCnt
b.io.scheExtra.sqCancelCnt := memBlock.io.sqCancelCnt
......@@ -321,9 +322,13 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
ctrlBlock.io.dispatch <> exuBlocks.flatMap(_.io.in)
ctrlBlock.io.rsReady := exuBlocks.flatMap(_.io.scheExtra.rsReady)
ctrlBlock.io.enqLsq <> memBlock.io.enqLsq
ctrlBlock.io.lqDeq := memBlock.io.lqDeq
ctrlBlock.io.sqDeq := memBlock.io.sqDeq
ctrlBlock.io.lqCanAccept := memBlock.io.lsqio.lqCanAccept
ctrlBlock.io.sqCanAccept := memBlock.io.lsqio.sqCanAccept
ctrlBlock.io.lqCancelCnt := memBlock.io.lqCancelCnt
ctrlBlock.io.sqCancelCnt := memBlock.io.sqCancelCnt
ctrlBlock.io.robHeadLsIssue := exuBlocks.map(_.io.scheExtra.robHeadLsIssue).reduce(_ || _)
exuBlocks(0).io.scheExtra.fpRfReadIn.get <> exuBlocks(1).io.scheExtra.fpRfReadOut.get
exuBlocks(0).io.scheExtra.fpStateReadIn.get <> exuBlocks(1).io.scheExtra.fpStateReadOut.get
......@@ -352,8 +357,7 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
exu.scheExtra.stIssuePtr <> memBlock.io.stIssuePtr
exu.scheExtra.debug_fp_rat <> ctrlBlock.io.debug_fp_rat
exu.scheExtra.debug_int_rat <> ctrlBlock.io.debug_int_rat
exu.scheExtra.lqFull := memBlock.io.lqFull
exu.scheExtra.sqFull := memBlock.io.sqFull
exu.scheExtra.robDeqPtr := ctrlBlock.io.robDeqPtr
exu.scheExtra.memWaitUpdateReq.staIssue.zip(memBlock.io.stIn).foreach{case (sink, src) => {
sink.bits := src.bits
sink.valid := src.valid
......@@ -419,6 +423,9 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
memBlock.io.lsqio.rob <> ctrlBlock.io.robio.lsq
memBlock.io.lsqio.exceptionAddr.isStore := CommitType.lsInstIsStore(ctrlBlock.io.robio.exception.bits.uop.ctrl.commitType)
memBlock.io.debug_ls <> ctrlBlock.io.robio.debug_ls
memBlock.io.lsTopdownInfo <> ctrlBlock.io.robio.lsTopdownInfo
memBlock.io.l2Hint.valid := io.l2Hint.valid
memBlock.io.l2Hint.bits.sourceId := io.l2Hint.bits.sourceId
val itlbRepeater1 = PTWFilter(itlbParams.fenceDelay,frontend.io.ptw, fenceio.sfence, csrioIn.tlb, l2tlbParams.ifilterSize)
val itlbRepeater2 = PTWRepeaterNB(passReady = false, itlbParams.fenceDelay, itlbRepeater1.io.ptw, ptw.io.tlb(0), fenceio.sfence, csrioIn.tlb)
......@@ -428,6 +435,8 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
ptw.io.csr.tlb <> csrioIn.tlb
ptw.io.csr.distribute_csr <> csrioIn.customCtrl.distribute_csr
ExcitingUtils.addSource(dtlbRepeater1.io.rob_head_miss_in_tlb, s"miss_in_dtlb_${coreParams.HartId}", ExcitingUtils.Perf, true)
// if l2 prefetcher use stream prefetch, it should be placed in XSCore
io.l2_pf_enable := csrioIn.customCtrl.l2_pf_enable
......
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan
import chisel3._
......@@ -7,11 +23,10 @@ import freechips.rocketchip.diplomacy._
import freechips.rocketchip.interrupts._
import freechips.rocketchip.tile.{BusErrorUnit, BusErrorUnitParams, BusErrors}
import freechips.rocketchip.tilelink._
import huancun.debug.TLLogger
import huancun.{HCCacheParamsKey, HuanCun}
import coupledL2.{L2ParamKey, CoupledL2}
import system.HasSoCParameter
import top.BusPerfMonitor
import utility.{DelayN, ResetGen, TLClientsMerger, TLEdgeBuffer}
import utility.{DelayN, ResetGen, TLClientsMerger, TLEdgeBuffer, TLLogger}
class L1BusErrorUnitInfo(implicit val p: Parameters) extends Bundle with HasSoCParameter {
val ecc_error = Valid(UInt(soc.PAddrBits.W))
......@@ -44,19 +59,20 @@ class XSTileMisc()(implicit p: Parameters) extends LazyModule
val beu = LazyModule(new BusErrorUnit(
new XSL1BusErrors(), BusErrorUnitParams(0x38010000)
))
val busPMU = BusPerfMonitor(enable = !debugOpts.FPGAPlatform)
val l1d_logger = TLLogger(s"L2_L1D_${coreParams.HartId}", !debugOpts.FPGAPlatform)
val misc_l2_pmu = BusPerfMonitor(name = "Misc_L2", enable = !debugOpts.FPGAPlatform)
val l2_l3_pmu = BusPerfMonitor(name = "L2_L3", enable = !debugOpts.FPGAPlatform, stat_latency = true)
val l1d_logger = TLLogger(s"L2_L1D_${coreParams.HartId}", !debugOpts.FPGAPlatform && debugOpts.AlwaysBasicDB)
val l2_binder = coreParams.L2CacheParamsOpt.map(_ => BankBinder(coreParams.L2NBanks, 64))
val i_mmio_port = TLTempNode()
val d_mmio_port = TLTempNode()
busPMU := l1d_logger
l1_xbar :=* busPMU
misc_l2_pmu := l1d_logger
l1_xbar :=* misc_l2_pmu
l2_binder match {
case Some(binder) =>
memory_port := TLBuffer.chainNode(2) := TLClientsMerger() := TLXbar() :=* binder
memory_port := TLBuffer.chainNode(2) := l2_l3_pmu := TLClientsMerger() := TLXbar() :=* binder
case None =>
memory_port := l1_xbar
}
......@@ -79,8 +95,8 @@ class XSTile()(implicit p: Parameters) extends LazyModule
private val core = LazyModule(new XSCore())
private val misc = LazyModule(new XSTileMisc())
private val l2cache = coreParams.L2CacheParamsOpt.map(l2param =>
LazyModule(new HuanCun()(new Config((_, _, _) => {
case HCCacheParamsKey => l2param.copy(enableTopDown = env.EnableTopDown)
LazyModule(new CoupledL2()(new Config((_, _, _) => {
case L2ParamKey => l2param.copy(hartIds = Seq(p(XSCoreParamsKey).HartId))
})))
)
......@@ -92,10 +108,11 @@ class XSTile()(implicit p: Parameters) extends LazyModule
val debug_int_sink = core.debug_int_sink
val beu_int_source = misc.beu.intNode
val core_reset_sink = BundleBridgeSink(Some(() => Reset()))
val l1d_l2_pmu = BusPerfMonitor(name = "L1d_L2", enable = !debugOpts.FPGAPlatform, stat_latency = true)
val l1d_to_l2_bufferOpt = coreParams.dcacheParametersOpt.map { _ =>
val buffer = LazyModule(new TLBuffer)
misc.l1d_logger := buffer.node := core.memBlock.dcache.clientNode
misc.l1d_logger := buffer.node := l1d_l2_pmu := core.memBlock.dcache.clientNode
buffer
}
......@@ -108,29 +125,21 @@ class XSTile()(implicit p: Parameters) extends LazyModule
(buffers, node)
}
val (l1i_to_l2_buffers, l1i_to_l2_buf_node) = chainBuffer(3, "l1i_to_l2_buffer")
misc.busPMU :=
TLLogger(s"L2_L1I_${coreParams.HartId}", !debugOpts.FPGAPlatform) :=
l1i_to_l2_buf_node :=
core.frontend.icache.clientNode
val ptw_to_l2_buffers = if (!coreParams.softPTW) {
val (buffers, buf_node) = chainBuffer(5, "ptw_to_l2_buffer")
misc.busPMU :=
TLLogger(s"L2_PTW_${coreParams.HartId}", !debugOpts.FPGAPlatform) :=
buf_node :=
core.ptw_to_l2_buffer.node
buffers
} else Seq()
misc.misc_l2_pmu := TLLogger(s"L2_L1I_${coreParams.HartId}", !debugOpts.FPGAPlatform && debugOpts.AlwaysBasicDB) := core.frontend.icache.clientNode
if (!coreParams.softPTW) {
misc.misc_l2_pmu := TLLogger(s"L2_PTW_${coreParams.HartId}", !debugOpts.FPGAPlatform && debugOpts.AlwaysBasicDB) := core.ptw_to_l2_buffer.node
}
l2cache match {
case Some(l2) =>
misc.l2_binder.get :*= l2.node :*= TLBuffer() :*= TLBuffer() :*= misc.l1_xbar
misc.l2_binder.get :*= l2.node :*= misc.l1_xbar
l2.pf_recv_node.map(recv => {
println("Connecting L1 prefetcher to L2!")
recv := core.memBlock.pf_sender_opt.get
})
case None =>
val dummyMatch = WireDefault(false.B)
ExcitingUtils.addSource(dummyMatch, s"L2MissMatch_${p(XSCoreParamsKey).HartId}", ExcitingUtils.Perf, true)
}
misc.i_mmio_port := core.frontend.instrUncache.clientNode
......@@ -150,8 +159,9 @@ class XSTile()(implicit p: Parameters) extends LazyModule
core.module.io.hartId := io.hartId
core.module.io.reset_vector := DelayN(io.reset_vector, 5)
io.cpu_halt := core.module.io.cpu_halt
if(l2cache.isDefined){
core.module.io.perfEvents.zip(l2cache.get.module.io.perfEvents.flatten).foreach(x => x._1.value := x._2)
if (l2cache.isDefined) {
// TODO: add perfEvents of L2
// core.module.io.perfEvents.zip(l2cache.get.module.io.perfEvents.flatten).foreach(x => x._1.value := x._2)
}
else {
core.module.io.perfEvents <> DontCare
......@@ -159,11 +169,17 @@ class XSTile()(implicit p: Parameters) extends LazyModule
misc.module.beu_errors.icache <> core.module.io.beu_errors.icache
misc.module.beu_errors.dcache <> core.module.io.beu_errors.dcache
if(l2cache.isDefined){
misc.module.beu_errors.l2.ecc_error.valid := l2cache.get.module.io.ecc_error.valid
misc.module.beu_errors.l2.ecc_error.bits := l2cache.get.module.io.ecc_error.bits
if (l2cache.isDefined) {
// TODO: add ECC interface of L2
// misc.module.beu_errors.l2.ecc_error.valid := l2cache.get.module.io.ecc_error.valid
// misc.module.beu_errors.l2.ecc_error.bits := l2cache.get.module.io.ecc_error.bits
misc.module.beu_errors.l2 <> 0.U.asTypeOf(misc.module.beu_errors.l2)
core.module.io.l2Hint.bits.sourceId := l2cache.get.module.io.l2_hint.bits
core.module.io.l2Hint.valid := l2cache.get.module.io.l2_hint.valid
} else {
misc.module.beu_errors.l2 <> 0.U.asTypeOf(misc.module.beu_errors.l2)
core.module.io.l2Hint.bits.sourceId := DontCare
core.module.io.l2Hint.valid := false.B
}
// Modules are reset one by one
......@@ -173,8 +189,6 @@ class XSTile()(implicit p: Parameters) extends LazyModule
// reset ----> OR_SYNC --> {Misc, L2 Cache, Cores}
val resetChain = Seq(
Seq(misc.module, core.module) ++
l1i_to_l2_buffers.map(_.module.asInstanceOf[MultiIOModule]) ++
ptw_to_l2_buffers.map(_.module.asInstanceOf[MultiIOModule]) ++
l1d_to_l2_bufferOpt.map(_.module) ++
l2cache.map(_.module)
)
......
......@@ -27,7 +27,7 @@ import xiangshan.backend.decode.{DecodeStage, FusionDecoder, ImmUnion}
import xiangshan.backend.dispatch.{Dispatch, Dispatch2Rs, DispatchQueue}
import xiangshan.backend.fu.PFEvent
import xiangshan.backend.rename.{Rename, RenameTableWrapper}
import xiangshan.backend.rob.{DebugLSIO, Rob, RobCSRIO, RobLsqIO}
import xiangshan.backend.rob.{DebugLSIO, LsTopdownInfo, Rob, RobCSRIO, RobLsqIO, RobPtr}
import xiangshan.frontend.{FtqPtr, FtqRead, Ftq_RF_Components}
import xiangshan.mem.mdp.{LFST, SSIT, WaitTable}
import xiangshan.ExceptionNO._
......@@ -87,6 +87,8 @@ class RedirectGenerator(implicit p: Parameters) extends XSModule
val redirect = Wire(Valid(new Redirect))
redirect.valid := exuOut.valid && exuOut.bits.redirect.cfiUpdate.isMisPred
redirect.bits := exuOut.bits.redirect
redirect.bits.debugIsCtrl := true.B
redirect.bits.debugIsMemVio := false.B
redirect
}
......@@ -211,9 +213,12 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val dispatch = Vec(3*dpParams.IntDqDeqWidth, DecoupledIO(new MicroOp))
val rsReady = Vec(outer.dispatch2.map(_.module.io.out.length).sum, Input(Bool()))
val enqLsq = Flipped(new LsqEnqIO)
val lqCancelCnt = Input(UInt(log2Up(LoadQueueSize + 1).W))
val lqCancelCnt = Input(UInt(log2Up(VirtualLoadQueueSize + 1).W))
val sqCancelCnt = Input(UInt(log2Up(StoreQueueSize + 1).W))
val lqDeq = Input(UInt(log2Up(CommitWidth + 1).W))
val sqDeq = Input(UInt(log2Ceil(EnsbufferWidth + 1).W))
val sqCanAccept = Input(Bool())
val lqCanAccept = Input(Bool())
val ld_pc_read = Vec(exuParameters.LduCnt, Flipped(new FtqRead(UInt(VAddrBits.W))))
// from int block
val exuRedirect = Vec(exuParameters.AluCnt + exuParameters.JmpCnt, Flipped(ValidIO(new ExuOutput)))
......@@ -229,6 +234,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val lsq = new RobLsqIO
// debug
val debug_ls = Flipped(new DebugLSIO)
val lsTopdownInfo = Vec(exuParameters.LduCnt, Input(new LsTopdownInfo))
}
val csrCtrl = Input(new CustomCSRCtrlIO)
val perfInfo = Output(new Bundle{
......@@ -242,8 +248,11 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val writeback = MixedVec(writebackLengths.map(num => Vec(num, Flipped(ValidIO(new ExuOutput)))))
// redirect out
val redirect = ValidIO(new Redirect)
// debug
val debug_int_rat = Vec(32, Output(UInt(PhyRegIdxWidth.W)))
val debug_fp_rat = Vec(32, Output(UInt(PhyRegIdxWidth.W)))
val robDeqPtr = Output(new RobPtr)
val robHeadLsIssue = Input(Bool())
})
override def writebackSource: Option[Seq[Seq[Valid[ExuOutput]]]] = {
......@@ -288,6 +297,8 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val flushRedirect = Wire(Valid(new Redirect))
flushRedirect.valid := RegNext(rob.io.flushOut.valid)
flushRedirect.bits := RegEnable(rob.io.flushOut.bits, rob.io.flushOut.valid)
flushRedirect.bits.debugIsCtrl := false.B
flushRedirect.bits.debugIsMemVio := false.B
val flushRedirectReg = Wire(Valid(new Redirect))
flushRedirectReg.valid := RegNext(flushRedirect.valid, init = false.B)
......@@ -310,7 +321,10 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
!io.memoryViolation.bits.robIdx.needFlush(Seq(stage2Redirect, redirectForExu)),
init = false.B
)
loadReplay.bits := RegEnable(io.memoryViolation.bits, io.memoryViolation.valid)
val memVioBits = WireDefault(io.memoryViolation.bits)
memVioBits.debugIsCtrl := false.B
memVioBits.debugIsMemVio := true.B
loadReplay.bits := RegEnable(memVioBits, io.memoryViolation.valid)
pcMem.io.raddr(2) := redirectGen.io.redirectPcRead.ptr.value
redirectGen.io.redirectPcRead.data := pcMem.io.rdata(2).getPc(RegNext(redirectGen.io.redirectPcRead.offset))
pcMem.io.raddr(3) := redirectGen.io.memPredPcRead.ptr.value
......@@ -360,40 +374,8 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
pendingRedirect := false.B
}
if (env.EnableTopDown) {
val stage2Redirect_valid_when_pending = pendingRedirect && stage2Redirect.valid
val stage2_redirect_cycles = RegInit(false.B) // frontend_bound->fetch_lantency->stage2_redirect
val MissPredPending = RegInit(false.B); val branch_resteers_cycles = RegInit(false.B) // frontend_bound->fetch_lantency->stage2_redirect->branch_resteers
val RobFlushPending = RegInit(false.B); val robFlush_bubble_cycles = RegInit(false.B) // frontend_bound->fetch_lantency->stage2_redirect->robflush_bubble
val LdReplayPending = RegInit(false.B); val ldReplay_bubble_cycles = RegInit(false.B) // frontend_bound->fetch_lantency->stage2_redirect->ldReplay_bubble
when(redirectGen.io.isMisspreRedirect) { MissPredPending := true.B }
when(flushRedirect.valid) { RobFlushPending := true.B }
when(redirectGen.io.loadReplay.valid) { LdReplayPending := true.B }
when (RegNext(io.frontend.toFtq.redirect.valid)) {
when(pendingRedirect) { stage2_redirect_cycles := true.B }
when(MissPredPending) { MissPredPending := false.B; branch_resteers_cycles := true.B }
when(RobFlushPending) { RobFlushPending := false.B; robFlush_bubble_cycles := true.B }
when(LdReplayPending) { LdReplayPending := false.B; ldReplay_bubble_cycles := true.B }
}
when(VecInit(decode.io.out.map(x => x.valid)).asUInt.orR){
when(stage2_redirect_cycles) { stage2_redirect_cycles := false.B }
when(branch_resteers_cycles) { branch_resteers_cycles := false.B }
when(robFlush_bubble_cycles) { robFlush_bubble_cycles := false.B }
when(ldReplay_bubble_cycles) { ldReplay_bubble_cycles := false.B }
}
XSPerfAccumulate("stage2_redirect_cycles", stage2_redirect_cycles)
XSPerfAccumulate("branch_resteers_cycles", branch_resteers_cycles)
XSPerfAccumulate("robFlush_bubble_cycles", robFlush_bubble_cycles)
XSPerfAccumulate("ldReplay_bubble_cycles", ldReplay_bubble_cycles)
XSPerfAccumulate("s2Redirect_pend_cycles", stage2Redirect_valid_when_pending)
}
decode.io.in <> io.frontend.cfVec
decode.io.stallReason.in <> io.frontend.stallReason
decode.io.csrCtrl := RegNext(io.csrCtrl)
decode.io.intRat <> rat.io.intReadPorts
decode.io.fpRat <> rat.io.fpReadPorts
......@@ -416,11 +398,15 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
waittable.io.csrCtrl := RegNext(io.csrCtrl)
// LFST lookup and update
val lfst = Module(new LFST)
lfst.io.redirect <> RegNext(io.redirect)
lfst.io.storeIssue <> RegNext(io.stIn)
lfst.io.csrCtrl <> RegNext(io.csrCtrl)
lfst.io.dispatch <> dispatch.io.lfst
dispatch.io.lfst := DontCare
if (LFSTEnable) {
val lfst = Module(new LFST)
lfst.io.redirect <> RegNext(io.redirect)
lfst.io.storeIssue <> RegNext(io.stIn)
lfst.io.csrCtrl <> RegNext(io.csrCtrl)
lfst.io.dispatch <> dispatch.io.lfst
}
rat.io.redirect := stage2Redirect.valid
rat.io.robCommits := rob.io.commits
......@@ -479,6 +465,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
rename.io.ssit <> ssit.io.rdata
rename.io.debug_int_rat <> rat.io.debug_int_rat
rename.io.debug_fp_rat <> rat.io.debug_fp_rat
rename.io.stallReason.in <> decode.io.stallReason.out
// pipeline between rename and dispatch
for (i <- 0 until RenameWidth) {
......@@ -492,6 +479,12 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
dispatch.io.toFpDq <> fpDq.io.enq
dispatch.io.toLsDq <> lsDq.io.enq
dispatch.io.allocPregs <> io.allocPregs
dispatch.io.robHead := rob.io.debugRobHead
dispatch.io.stallReason <> rename.io.stallReason.out
dispatch.io.lqCanAccept := io.lqCanAccept
dispatch.io.sqCanAccept := io.sqCanAccept
dispatch.io.robHeadNotReady := rob.io.headNotReady
dispatch.io.robFull := rob.io.robFull
dispatch.io.singleStep := RegNext(io.csrCtrl.singlestep)
intDq.io.redirect <> redirectForExu
......@@ -513,11 +506,12 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val lsqCtrl = Module(new LsqEnqCtrl)
lsqCtrl.io.redirect <> redirectForExu
lsqCtrl.io.enq <> dp2.enqLsq.get
lsqCtrl.io.lcommit := rob.io.lsq.lcommit
lsqCtrl.io.lcommit := io.lqDeq
lsqCtrl.io.scommit := io.sqDeq
lsqCtrl.io.lqCancelCnt := io.lqCancelCnt
lsqCtrl.io.sqCancelCnt := io.sqCancelCnt
io.enqLsq <> lsqCtrl.io.enqLsq
rob.io.debugEnqLsq := io.enqLsq
}
}
for ((dp2In, i) <- outer.dispatch2.flatMap(_.module.io.in).zipWithIndex) {
......@@ -570,6 +564,9 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
io.robio.lsq <> rob.io.lsq
rob.io.debug_ls := io.robio.debug_ls
rob.io.debugHeadLsIssue := io.robHeadLsIssue
rob.io.lsTopdownInfo := io.robio.lsTopdownInfo
io.robDeqPtr := rob.io.robDeqPtr
io.perfInfo.ctrlInfo.robFull := RegNext(rob.io.robFull)
io.perfInfo.ctrlInfo.intdqFull := RegNext(intDq.io.dqFull)
......
......@@ -21,16 +21,17 @@ import chisel3._
import chisel3.util._
import freechips.rocketchip.diplomacy.{BundleBridgeSource, LazyModule, LazyModuleImp}
import freechips.rocketchip.tile.HasFPUParameters
import huancun.PrefetchRecv
import coupledL2.PrefetchRecv
import utils._
import utility._
import xiangshan._
import xiangshan.backend.exu.StdExeUnit
import xiangshan.backend.fu._
import xiangshan.backend.rob.{DebugLSIO, RobLsqIO}
import xiangshan.backend.rob.{DebugLSIO, LsTopdownInfo, RobLsqIO}
import xiangshan.cache._
import xiangshan.cache.mmu.{VectorTlbPtwIO, TLBNonBlock, TlbReplace}
import xiangshan.mem._
import xiangshan.mem.mdp._
import xiangshan.mem.prefetch.{BasePrefecher, SMSParams, SMSPrefetcher}
class Std(implicit p: Parameters) extends FunctionUnit {
......@@ -73,7 +74,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val issue = Vec(exuParameters.LsExuCnt + exuParameters.StuCnt, Flipped(DecoupledIO(new ExuInput)))
val loadFastMatch = Vec(exuParameters.LduCnt, Input(UInt(exuParameters.LduCnt.W)))
val loadFastImm = Vec(exuParameters.LduCnt, Input(UInt(12.W)))
val rsfeedback = Vec(exuParameters.StuCnt, new MemRSFeedbackIO)
val rsfeedback = Vec(exuParameters.LsExuCnt, new MemRSFeedbackIO)
val loadPc = Vec(exuParameters.LduCnt, Input(UInt(VAddrBits.W))) // for hw prefetch
val stIssuePtr = Output(new SqPtr())
val int2vlsu = Flipped(new Int2VLSUIO)
......@@ -99,6 +100,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val lsqio = new Bundle {
val exceptionAddr = new ExceptionAddrIO // to csr
val rob = Flipped(new RobLsqIO) // rob to lsq
val lqCanAccept = Output(Bool())
val sqCanAccept = Output(Bool())
}
val csrCtrl = Flipped(new CustomCSRCtrlIO)
val csrUpdate = new DistributedCSRUpdateReq
......@@ -108,13 +111,14 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val lqFull = Output(Bool())
val dcacheMSHRFull = Output(Bool())
}
val sqFull = Output(Bool())
val lqFull = Output(Bool())
val perfEventsPTW = Input(Vec(19, new PerfEvent))
val lqCancelCnt = Output(UInt(log2Up(LoadQueueSize + 1).W))
val lqCancelCnt = Output(UInt(log2Up(VirtualLoadQueueSize + 1).W))
val sqCancelCnt = Output(UInt(log2Up(StoreQueueSize + 1).W))
val sqDeq = Output(UInt(log2Ceil(EnsbufferWidth + 1).W))
val lqDeq = Output(UInt(log2Up(CommitWidth + 1).W))
val debug_ls = new DebugLSIO
val lsTopdownInfo = Vec(exuParameters.LduCnt, Output(new LsTopdownInfo))
val l2Hint = Input(Valid(new L2ToL1Hint()))
})
override def writebackSource1: Option[Seq[Seq[DecoupledIO[ExuOutput]]]] = Some(Seq(io.writeback))
......@@ -169,7 +173,6 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
loadUnits.zipWithIndex.map(x => x._1.suggestName("LoadUnit_"+x._2))
storeUnits.zipWithIndex.map(x => x._1.suggestName("StoreUnit_"+x._2))
val atomicsUnit = Module(new AtomicsUnit)
// Atom inst comes from sta / std, then its result
......@@ -178,17 +181,17 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
// However, atom exception will be writebacked to rob
// using store writeback port
val loadWritebackOverride = Mux(atomicsUnit.io.out.valid, atomicsUnit.io.out.bits, loadUnits.head.io.ldout.bits)
val ldOut0 = Wire(Decoupled(new ExuOutput))
ldOut0.valid := atomicsUnit.io.out.valid || loadUnits.head.io.ldout.valid
ldOut0.bits := loadWritebackOverride
atomicsUnit.io.out.ready := ldOut0.ready
loadUnits.head.io.ldout.ready := ldOut0.ready
val loadWritebackOverride = Mux(atomicsUnit.io.out.valid, atomicsUnit.io.out.bits, loadUnits.head.io.loadOut.bits)
val loadOut0 = Wire(Decoupled(new ExuOutput))
loadOut0.valid := atomicsUnit.io.out.valid || loadUnits.head.io.loadOut.valid
loadOut0.bits := loadWritebackOverride
atomicsUnit.io.out.ready := loadOut0.ready
loadUnits.head.io.loadOut.ready := loadOut0.ready
when(atomicsUnit.io.out.valid){
ldOut0.bits.uop.cf.exceptionVec := 0.U(16.W).asBools // exception will be writebacked via store wb port
loadOut0.bits.uop.cf.exceptionVec := 0.U(16.W).asBools // exception will be writebacked via store wb port
}
val ldExeWbReqs = ldOut0 +: loadUnits.tail.map(_.io.ldout)
val ldExeWbReqs = loadOut0 +: loadUnits.tail.map(_.io.loadOut)
io.writeback <> ldExeWbReqs ++ VecInit(storeUnits.map(_.io.stout)) ++ VecInit(stdExeUnits.map(_.io.out))
io.otherFastWakeup := DontCare
io.otherFastWakeup.take(2).zip(loadUnits.map(_.io.fastUop)).foreach{case(a,b)=> a := b}
......@@ -203,7 +206,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
loadUnits(0).io.prefetch_req.bits.confidence := 0.U
l1_pf_req.ready := (l1_pf_req.bits.confidence > 0.U) ||
loadUnits.map(!_.io.ldin.valid).reduce(_ || _)
loadUnits.map(!_.io.loadIn.valid).reduce(_ || _)
// l1 pf fuzzer interface
val DebugEnableL1PFFuzzer = false
......@@ -223,7 +226,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
}
// TODO: fast load wakeup
val lsq = Module(new LsqWrappper)
val lsq = Module(new LsqWrapper)
val vlsq = Module(new DummyVectorLsq)
val sbuffer = Module(new Sbuffer)
// if you wants to stress test dcache store, use FakeSbuffer
......@@ -303,6 +306,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
io.debug_ls.debugLsInfo(i + exuParameters.LduCnt) := storeUnits(i).io.debug_ls
}
io.lsTopdownInfo := loadUnits.map(_.io.lsTopdownInfo)
// pmp
val pmp = Module(new PMP())
pmp.io.distribute_csr <> csrCtrl.distribute_csr
......@@ -335,12 +340,67 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
PrintTriggerInfo(tEnable(j), tdata(j))
// LoadUnit
// One re-balance candidate: a fast-replay request tagged with the load
// pipeline it came from and whether it may be moved to another pipeline.
class BalanceEntry extends XSBundle {
// set when this replay was caused by a dcache bank conflict; only such
// entries are eligible to migrate to another load pipeline
val balance = Bool()
// the fast-replay request payload being carried
val req = new LqWriteBundle
// index of the load pipeline that produced this request
val port = UInt(log2Up(LoadPipelineWidth).W)
}
// Re-order fast-replay candidates: the highest-priority valid entry whose
// `balance` flag is set is swapped into slot 0, and the original slot-0
// entry takes the picked entry's slot; every other entry keeps its position.
// When no entry requests balancing, the input order is passed through.
def balanceReOrder(sel: Seq[ValidIO[BalanceEntry]]): Seq[ValidIO[BalanceEntry]] = {
require(sel.length > 0)
// highest-priority entry that is both valid and asking to be balanced
val balancePick = ParallelPriorityMux(sel.map(x => (x.valid && x.bits.balance) -> x))
val reorderSel = Wire(Vec(sel.length, ValidIO(new BalanceEntry)))
(0 until sel.length).map(i =>
if (i == 0) {
// slot 0 receives the picked entry when balancing is requested
when (balancePick.valid && balancePick.bits.balance) {
reorderSel(i) := balancePick
} .otherwise {
reorderSel(i) := sel(i)
}
} else {
// the slot the picked entry came from (bits.port) receives the
// original slot-0 entry, completing the swap
when (balancePick.valid && balancePick.bits.balance && i.U === balancePick.bits.port) {
reorderSel(i) := sel(0)
} .otherwise {
reorderSel(i) := sel(i)
}
}
)
reorderSel
}
// Wrap each load unit's fast-replay output as a BalanceEntry so the requests
// can be re-ordered across load pipelines by balanceReOrder.
val fastReplaySel = loadUnits.zipWithIndex.map { case (ldu, i) => {
val wrapper = Wire(Valid(new BalanceEntry))
wrapper.valid := ldu.io.fastReplayOut.valid
wrapper.bits.req := ldu.io.fastReplayOut.bits
// only bank-conflict replays are marked eligible for re-balancing
wrapper.bits.balance := ldu.io.fastReplayOut.bits.replayInfo.cause(LoadReplayCauses.bankConflict)
// remember the producing pipeline so the swap can be undone on the ready path
wrapper.bits.port := i.U
wrapper
}}
// re-balanced view of the fast-replay requests, indexed by target pipeline
val balanceFastReplaySel = balanceReOrder(fastReplaySel)
for (i <- 0 until exuParameters.LduCnt) {
loadUnits(i).io.redirect <> redirect
loadUnits(i).io.rsIdx := io.rsfeedback(i).rsIdx // DontCare
loadUnits(i).io.isFirstIssue := true.B
// get input from dispatch
loadUnits(i).io.ldin <> io.issue(i)
loadUnits(i).io.loadIn <> io.issue(i)
loadUnits(i).io.feedbackSlow <> io.rsfeedback(i).feedbackSlow
loadUnits(i).io.feedbackFast <> io.rsfeedback(i).feedbackFast
loadUnits(i).io.rsIdx := io.rsfeedback(i).rsIdx
// fast replay
loadUnits(i).io.fastReplayIn.valid := balanceFastReplaySel(i).valid
loadUnits(i).io.fastReplayIn.bits := balanceFastReplaySel(i).bits.req
loadUnits(i).io.fastReplayOut.ready := false.B
for (j <- 0 until exuParameters.LduCnt) {
when (balanceFastReplaySel(j).valid && balanceFastReplaySel(j).bits.port === i.U) {
loadUnits(i).io.fastReplayOut.ready := loadUnits(j).io.fastReplayIn.ready
}
}
// get input from dispatch
loadUnits(i).io.loadIn <> io.issue(i)
// dcache access
loadUnits(i).io.dcache <> dcache.io.lsu.load(i)
// forward
......@@ -349,7 +409,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
loadUnits(i).io.tlDchannel := dcache.io.lsu.forward_D(i)
loadUnits(i).io.forward_mshr <> dcache.io.lsu.forward_mshr(i)
// ld-ld violation check
loadUnits(i).io.lsq.loadViolationQuery <> lsq.io.loadViolationQuery(i)
loadUnits(i).io.lsq.loadLoadViolationQuery <> lsq.io.ldu.loadLoadViolationQuery(i)
loadUnits(i).io.lsq.storeLoadViolationQuery <> lsq.io.ldu.storeLoadViolationQuery(i)
loadUnits(i).io.csrCtrl <> csrCtrl
// dcache refill req
loadUnits(i).io.refill <> delayedDcacheRefill
......@@ -357,10 +418,11 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
loadUnits(i).io.tlb <> dtlb_reqs.take(exuParameters.LduCnt)(i)
// pmp
loadUnits(i).io.pmp <> pmp_check(i).resp
// st-ld violation query
// st-ld violation query
for (s <- 0 until StorePipelineWidth) {
loadUnits(i).io.reExecuteQuery(s) := storeUnits(s).io.reExecuteQuery
}
loadUnits(i).io.lqReplayFull <> lsq.io.lqReplayFull
// prefetch
prefetcherOpt.foreach(pf => {
pf.io.ld_in(i).valid := Mux(pf_train_on_hit,
......@@ -383,32 +445,21 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val fastMatch = ParallelPriorityMux(fastValidVec, fastMatchVec)
loadUnits(i).io.loadFastMatch := fastMatch
loadUnits(i).io.loadFastImm := io.loadFastImm(i)
loadUnits(i).io.replay <> lsq.io.replay(i)
// Lsq to load unit's rs
// passdown to lsq (load s1)
lsq.io.loadPaddrIn(i) <> loadUnits(i).io.lsq.loadPaddrIn
lsq.io.loadVaddrIn(i) <> loadUnits(i).io.lsq.loadVaddrIn
lsq.io.replayFast(i) := loadUnits(i).io.lsq.replayFast
lsq.io.replaySlow(i) := loadUnits(i).io.lsq.replaySlow
loadUnits(i).io.lsqOut <> lsq.io.loadOut(i)
loadUnits(i).io.l2Hint <> io.l2Hint
// passdown to lsq (load s2)
lsq.io.loadIn(i) <> loadUnits(i).io.lsq.loadIn
lsq.io.ldout(i) <> loadUnits(i).io.lsq.ldout
lsq.io.ldu.loadIn(i) <> loadUnits(i).io.lsq.loadIn
lsq.io.loadOut(i) <> loadUnits(i).io.lsq.loadOut
lsq.io.ldRawDataOut(i) <> loadUnits(i).io.lsq.ldRawData
lsq.io.s2_load_data_forwarded(i) <> loadUnits(i).io.lsq.s2_load_data_forwarded
lsq.io.trigger(i) <> loadUnits(i).io.lsq.trigger
// passdown to lsq (load s3)
lsq.io.s2_dcache_require_replay(i) <> loadUnits(i).io.lsq.s2_dcache_require_replay
lsq.io.s3_replay_from_fetch(i) <> loadUnits(i).io.lsq.s3_replay_from_fetch
lsq.io.s3_delayed_load_error(i) <> loadUnits(i).io.s3_delayed_load_error
lsq.io.l2Hint.valid := io.l2Hint.valid
lsq.io.l2Hint.bits.sourceId := io.l2Hint.bits.sourceId
// alter writeback exception info
io.s3_delayed_load_error(i) := loadUnits(i).io.lsq.s3_delayed_load_error
io.s3_delayed_load_error(i) := loadUnits(i).io.s3_delayedLoadError
// update mem dependency predictor
// io.memPredUpdate(i) := DontCare
......@@ -458,25 +509,25 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
stdExeUnits(i).io.out := DontCare
stu.io.redirect <> redirect
stu.io.feedbackSlow <> io.rsfeedback(i).feedbackSlow
stu.io.rsIdx <> io.rsfeedback(i).rsIdx
stu.io.feedbackSlow <> io.rsfeedback(exuParameters.LduCnt + i).feedbackSlow
stu.io.rsIdx <> io.rsfeedback(exuParameters.LduCnt + i).rsIdx
// NOTE: just for dtlb's perf cnt
stu.io.isFirstIssue <> io.rsfeedback(i).isFirstIssue
stu.io.isFirstIssue <> io.rsfeedback(exuParameters.LduCnt + i).isFirstIssue
stu.io.stin <> io.issue(exuParameters.LduCnt + i)
stu.io.lsq <> lsq.io.storeIn(i)
stu.io.lsq_replenish <> lsq.io.storeInRe(i)
stu.io.lsq <> lsq.io.sta.storeAddrIn(i)
stu.io.lsq_replenish <> lsq.io.sta.storeAddrInRe(i)
// dtlb
stu.io.tlb <> dtlb_reqs.drop(exuParameters.LduCnt)(i)
stu.io.pmp <> pmp_check(i+exuParameters.LduCnt).resp
// store unit does not need fast feedback
io.rsfeedback(i).feedbackFast := DontCare
io.rsfeedback(exuParameters.LduCnt + i).feedbackFast := DontCare
// Lsq to sta unit
lsq.io.storeMaskIn(i) <> stu.io.storeMaskOut
lsq.io.sta.storeMaskIn(i) <> stu.io.storeMaskOut
// Lsq to std unit's rs
lsq.io.storeDataIn(i) := stData(i)
lsq.io.std.storeDataIn(i) := stData(i)
// 1. sync issue info to store set LFST
......@@ -545,6 +596,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
lsq.io.enq <> io.enqLsq
lsq.io.brqRedirect <> redirect
io.memoryViolation <> lsq.io.rollback
io.lsqio.lqCanAccept := lsq.io.lqCanAccept
io.lsqio.sqCanAccept := lsq.io.sqCanAccept
// lsq.io.uncache <> uncache.io.lsq
AddPipelineReg(lsq.io.uncache.req, uncache.io.lsq.req, false.B)
AddPipelineReg(uncache.io.lsq.resp, lsq.io.uncache.resp, false.B)
......@@ -553,11 +606,11 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
lsq.io.release := dcache.io.lsu.release
lsq.io.lqCancelCnt <> io.lqCancelCnt
lsq.io.sqCancelCnt <> io.sqCancelCnt
lsq.io.lqDeq <> io.lqDeq
lsq.io.sqDeq <> io.sqDeq
// LSQ to store buffer
lsq.io.sbuffer <> sbuffer.io.in
lsq.io.sqempty <> sbuffer.io.sqempty
lsq.io.sqEmpty <> sbuffer.io.sqempty
// Sbuffer
sbuffer.io.csrCtrl <> csrCtrl
......@@ -617,7 +670,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
atomicsUnit.io.storeDataIn.bits := Mux1H(Seq.tabulate(exuParameters.StuCnt)(i =>
st_data_atomics(i) -> stData(i).bits))
atomicsUnit.io.rsIdx := Mux1H(Seq.tabulate(exuParameters.StuCnt)(i =>
st_atomics(i) -> io.rsfeedback(atomic_replay_port_idx(i)).rsIdx))
st_atomics(i) -> io.rsfeedback(atomic_rs(i)).rsIdx))
atomicsUnit.io.redirect <> redirect
// TODO: complete amo's pmp support
......@@ -636,7 +689,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
when (state =/= s_normal) {
// use store wb port instead of load
loadUnits(0).io.ldout.ready := false.B
loadUnits(0).io.loadOut.ready := false.B
// use load_0's TLB
atomicsUnit.io.dtlb <> amoTlb
......@@ -644,11 +697,11 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
loadUnits.map(i => i.io.prefetch_req.valid := false.B)
// make sure there's no in-flight uops in load unit
assert(!loadUnits(0).io.ldout.valid)
assert(!loadUnits(0).io.loadOut.valid)
}
for (i <- 0 until exuParameters.StuCnt) when (state === s_atomics(i)) {
atomicsUnit.io.feedbackSlow <> io.rsfeedback(atomic_replay_port_idx(i)).feedbackSlow
atomicsUnit.io.feedbackSlow <> io.rsfeedback(atomic_rs(i)).feedbackSlow
assert(!storeUnits(i).io.feedbackSlow.valid)
}
......@@ -670,9 +723,6 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
io.memInfo.lqFull := RegNext(lsq.io.lqFull)
io.memInfo.dcacheMSHRFull := RegNext(dcache.io.mshrFull)
io.lqFull := lsq.io.lqFull
io.sqFull := lsq.io.sqFull
val ldDeqCount = PopCount(io.issue.take(exuParameters.LduCnt).map(_.valid))
val stDeqCount = PopCount(io.issue.drop(exuParameters.LduCnt).map(_.valid))
val rsDeqCount = ldDeqCount + stDeqCount
......
......@@ -30,6 +30,7 @@ import xiangshan.backend.fu.fpu.FMAMidResultIO
import xiangshan.backend.issue.ReservationStationWrapper
import xiangshan.backend.regfile.{Regfile, RfReadPort}
import xiangshan.backend.rename.{BusyTable, BusyTableReadIO}
import xiangshan.backend.rob.RobPtr
import xiangshan.mem.{LsqEnqCtrl, LsqEnqIO, MemWaitUpdateReq, SqPtr}
import chisel3.ExcitingUtils
......@@ -263,16 +264,15 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
val lcommit = Input(UInt(log2Up(CommitWidth + 1).W))
val scommit = Input(UInt(log2Ceil(EnsbufferWidth + 1).W)) // connected to `memBlock.io.sqDeq` instead of ROB
// from lsq
val lqCancelCnt = Input(UInt(log2Up(LoadQueueSize + 1).W))
val lqCancelCnt = Input(UInt(log2Up(VirtualLoadQueueSize + 1).W))
val sqCancelCnt = Input(UInt(log2Up(StoreQueueSize + 1).W))
val memWaitUpdateReq = Flipped(new MemWaitUpdateReq)
// debug
val debug_int_rat = Vec(32, Input(UInt(PhyRegIdxWidth.W)))
val debug_fp_rat = Vec(32, Input(UInt(PhyRegIdxWidth.W)))
// perf
val sqFull = Input(Bool())
val lqFull = Input(Bool())
val robDeqPtr = Input(new RobPtr)
val robHeadLsIssue = Output(Bool())
}
val numFma = outer.reservationStations.map(_.module.io.fmaMid.getOrElse(Seq()).length).sum
......@@ -525,6 +525,9 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
}
}
val lsRsDeqPorts = outer.reservationStations.filter(_.params.lsqFeedback).map(_.module.io.deq).flatten
io.extra.robHeadLsIssue := lsRsDeqPorts.map(deq => deq.fire && deq.bits.uop.robIdx === io.extra.robDeqPtr).reduceOption(_ || _).getOrElse(false.B)
if ((env.AlwaysBasicDiff || env.EnableDifftest) && intRfConfig._1) {
val difftest = Module(new DifftestArchIntRegState)
difftest.io.clock := clock
......@@ -543,20 +546,6 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
XSPerfAccumulate("issue_valid", PopCount(io.issue.map(_.valid)))
XSPerfAccumulate("issue_fire", PopCount(io.issue.map(_.fire)))
if (env.EnableTopDown && rs_all.exists(_.params.isLoad)) {
val stall_ls_dq = WireDefault(0.B)
ExcitingUtils.addSink(stall_ls_dq, "stall_ls_dq", ExcitingUtils.Perf)
val ld_rs_full = !rs_all.filter(_.params.isLoad).map(_.module.io.fromDispatch.map(_.ready).reduce(_ && _)).reduce(_ && _)
val st_rs_full = !rs_all.filter(rs => rs.params.isStore || rs.params.isStoreData).map(_.module.io.fromDispatch.map(_.ready).reduce(_ && _)).reduce(_ && _)
val stall_stores_bound = stall_ls_dq && (st_rs_full || io.extra.sqFull)
val stall_loads_bound = stall_ls_dq && (ld_rs_full || io.extra.lqFull)
val stall_ls_bandwidth_bound = stall_ls_dq && !(st_rs_full || io.extra.sqFull) && !(ld_rs_full || io.extra.lqFull)
ExcitingUtils.addSource(stall_loads_bound, "stall_loads_bound", ExcitingUtils.Perf)
XSPerfAccumulate("stall_loads_bound", stall_loads_bound)
XSPerfAccumulate("stall_stores_bound", stall_stores_bound)
XSPerfAccumulate("stall_ls_bandwidth_bound", stall_ls_bandwidth_bound)
}
val lastCycleAllocate = RegNext(VecInit(allocate.map(_.fire)))
val lastCycleIssue = RegNext(VecInit(io.issue.map(_.fire)))
val schedulerPerf = Seq(
......
......@@ -37,6 +37,10 @@ class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents {
val csrCtrl = Input(new CustomCSRCtrlIO)
// perf only
val fusion = Vec(DecodeWidth - 1, Input(Bool()))
val stallReason = new Bundle {
val in = Flipped(new StallReasonIO(DecodeWidth))
val out = new StallReasonIO(DecodeWidth)
}
})
val decoders = Seq.fill(DecodeWidth)(Module(new DecodeUnit))
......@@ -72,16 +76,20 @@ class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents {
debug_globalCounter := debug_globalCounter + PopCount(io.out.map(_.fire))
io.stallReason.in.backReason := io.stallReason.out.backReason
io.stallReason.out.reason.zip(io.stallReason.in.reason).zip(io.in.map(_.valid)).foreach { case ((out, in), valid) =>
out := Mux(io.stallReason.out.backReason.valid,
io.stallReason.out.backReason.bits,
Mux(valid, TopDownCounters.NoStall.id.U, in))
}
XSPerfAccumulate("utilization", PopCount(io.in.map(_.valid)))
XSPerfAccumulate("waitInstr", PopCount((0 until DecodeWidth).map(i => io.in(i).valid && !io.in(i).ready)))
XSPerfAccumulate("stall_cycle", hasValid && !io.out(0).ready)
if (env.EnableTopDown) {
XSPerfAccumulate("slots_issued", PopCount(io.out.map(_.fire)))
XSPerfAccumulate("decode_bubbles", PopCount(io.out.map(x => !x.valid && x.ready))) // Unutilized issue-pipeline slots while there is no backend-stall
XSPerfAccumulate("fetch_bubbles", PopCount((0 until DecodeWidth).map(i => !io.in(i).valid && io.in(i).ready))) //slots
XSPerfAccumulate("ifu2id_allNO_cycle", VecInit((0 until DecodeWidth).map(i => !io.in(i).valid && io.in(i).ready)).asUInt.andR)
}
XSPerfHistogram("slots_fire", PopCount(io.out.map(_.fire)), true.B, 0, DecodeWidth+1, 1)
XSPerfHistogram("slots_valid_pure", PopCount(io.in.map(_.valid)), io.out(0).fire, 0, DecodeWidth+1, 1)
XSPerfHistogram("slots_valid_rough", PopCount(io.in.map(_.valid)), true.B, 0, DecodeWidth+1, 1)
val fusionValid = RegNext(io.fusion)
val inFire = io.in.map(in => RegNext(in.valid && !in.ready))
......
......@@ -70,6 +70,13 @@ class Dispatch(implicit p: Parameters) extends XSModule with HasPerfEvents {
val singleStep = Input(Bool())
// lfst
val lfst = new DispatchLFSTIO
// perf only
val robHead = Input(new MicroOp)
val stallReason = Flipped(new StallReasonIO(RenameWidth))
val lqCanAccept = Input(Bool())
val sqCanAccept = Input(Bool())
val robHeadNotReady = Input(Bool())
val robFull = Input(Bool())
})
/**
......@@ -105,7 +112,7 @@ class Dispatch(implicit p: Parameters) extends XSModule with HasPerfEvents {
val updatedUop = Wire(Vec(RenameWidth, new MicroOp))
val updatedCommitType = Wire(Vec(RenameWidth, CommitType()))
val checkpoint_id = RegInit(0.U(64.W))
checkpoint_id := checkpoint_id + PopCount((0 until RenameWidth).map(i =>
checkpoint_id := checkpoint_id + PopCount((0 until RenameWidth).map(i =>
io.fromRename(i).fire()
))
......@@ -151,7 +158,7 @@ class Dispatch(implicit p: Parameters) extends XSModule with HasPerfEvents {
if(i == 0){
debug_runahead_checkpoint_id := checkpoint_id
} else {
debug_runahead_checkpoint_id := checkpoint_id + PopCount((0 until i).map(i =>
debug_runahead_checkpoint_id := checkpoint_id + PopCount((0 until i).map(i =>
io.fromRename(i).fire()
))
}
......@@ -204,6 +211,7 @@ class Dispatch(implicit p: Parameters) extends XSModule with HasPerfEvents {
// (1) resources are ready
// (2) previous instructions are ready
val thisCanActualOut = (0 until RenameWidth).map(i => !thisIsBlocked(i) && notBlockedByPrevious(i))
val thisActualOut = (0 until RenameWidth).map(i => io.enqRob.req(i).valid && io.enqRob.canAccept)
val hasValidException = io.fromRename.zip(hasException).map(x => x._1.valid && x._2)
// input for ROB, LSQ, Dispatch Queue
......@@ -264,31 +272,92 @@ class Dispatch(implicit p: Parameters) extends XSModule with HasPerfEvents {
PopCount(io.toLsDq.req.map(_.valid && io.toLsDq.canAccept))
XSError(enqFireCnt > renameFireCnt, "enqFireCnt should not be greater than renameFireCnt\n")
val stall_rob = hasValidInstr && !io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept
val stall_int_dq = hasValidInstr && io.enqRob.canAccept && !io.toIntDq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept
val stall_fp_dq = hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && !io.toFpDq.canAccept && io.toLsDq.canAccept
val stall_ls_dq = hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && !io.toLsDq.canAccept
XSPerfAccumulate("in", Mux(RegNext(io.fromRename(0).ready), PopCount(io.fromRename.map(_.valid)), 0.U))
XSPerfAccumulate("empty", !hasValidInstr)
XSPerfAccumulate("utilization", PopCount(io.fromRename.map(_.valid)))
XSPerfAccumulate("waitInstr", PopCount((0 until RenameWidth).map(i => io.fromRename(i).valid && !io.recv(i))))
XSPerfAccumulate("stall_cycle_rob", hasValidInstr && !io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept)
XSPerfAccumulate("stall_cycle_int_dq", hasValidInstr && io.enqRob.canAccept && !io.toIntDq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept)
XSPerfAccumulate("stall_cycle_fp_dq", hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && !io.toFpDq.canAccept && io.toLsDq.canAccept)
XSPerfAccumulate("stall_cycle_ls_dq", hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && !io.toLsDq.canAccept)
if (env.EnableTopDown) {
val stall_ls_dq = hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && !io.toLsDq.canAccept
ExcitingUtils.addSource(stall_ls_dq, "stall_ls_dq", ExcitingUtils.Perf)
// TODO: we may need finer counters to count responding slots more precisely, i.e. per-slot granularity.
XSPerfAccumulate("stall_cycle_rob", stall_rob)
XSPerfAccumulate("stall_cycle_int_dq", stall_int_dq)
XSPerfAccumulate("stall_cycle_fp_dq", stall_fp_dq)
XSPerfAccumulate("stall_cycle_ls_dq", stall_ls_dq)
val Seq(notIssue, tlbReplay, tlbMiss, vioReplay, mshrReplay, l1Miss, l2Miss, l3Miss) =
Seq.fill(8)(WireDefault(false.B))
ExcitingUtils.addSink(notIssue, s"rob_head_ls_issue_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(tlbReplay, s"load_tlb_replay_stall_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(tlbMiss, s"load_tlb_miss_stall_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(vioReplay, s"load_vio_replay_stall_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(mshrReplay, s"load_mshr_replay_stall_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(l1Miss, s"load_l1_miss_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(l2Miss, s"L2MissMatch_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(l3Miss, s"L3MissMatch_${coreParams.HartId}", ExcitingUtils.Perf)
val ldReason = Mux(l3Miss, TopDownCounters.LoadMemStall.id.U,
Mux(l2Miss, TopDownCounters.LoadL3Stall.id.U,
Mux(l1Miss, TopDownCounters.LoadL2Stall.id.U,
Mux(notIssue, TopDownCounters.MemNotReadyStall.id.U,
Mux(tlbMiss, TopDownCounters.LoadTLBStall.id.U,
Mux(tlbReplay, TopDownCounters.LoadTLBStall.id.U,
Mux(mshrReplay, TopDownCounters.LoadMSHRReplayStall.id.U,
Mux(vioReplay, TopDownCounters.LoadVioReplayStall.id.U,
TopDownCounters.LoadL1Stall.id.U))))))))
val stallReason = Wire(chiselTypeOf(io.stallReason.reason))
val realFired = io.recv.zip(io.fromRename.map(_.valid)).map(x => x._1 && x._2)
io.stallReason.backReason.valid := !io.recv.head
io.stallReason.backReason.bits := TopDownCounters.OtherCoreStall.id.U
stallReason.zip(io.stallReason.reason).zip(io.recv).zip(realFired).map { case (((update, in), recv), fire) =>
import FuType._
val headIsInt = isIntExu(io.robHead.ctrl.fuType) && io.robHeadNotReady
val headIsFp = isFpExu(io.robHead.ctrl.fuType) && io.robHeadNotReady
val headIsDiv = isDivSqrt(io.robHead.ctrl.fuType) && io.robHeadNotReady
val headIsLd = io.robHead.ctrl.fuType === ldu && io.robHeadNotReady || !io.lqCanAccept
val headIsSt = io.robHead.ctrl.fuType === stu && io.robHeadNotReady || !io.sqCanAccept
val headIsAmo = io.robHead.ctrl.fuType === mou && io.robHeadNotReady
val headIsLs = headIsLd || headIsSt
val robLsFull = io.robFull || !io.lqCanAccept || !io.sqCanAccept
import TopDownCounters._
update := MuxCase(OtherCoreStall.id.U, Seq(
// fire
(fire ) -> NoStall.id.U ,
// dispatch not stall / core stall from rename
(in =/= OtherCoreStall.id.U ) -> in ,
// dispatch queue stall
(!io.toIntDq.canAccept && !headIsInt && !io.robFull) -> IntDqStall.id.U ,
(!io.toFpDq.canAccept && !headIsFp && !io.robFull) -> FpDqStall.id.U ,
(!io.toLsDq.canAccept && !headIsLs && !robLsFull ) -> LsDqStall.id.U ,
// rob stall
(headIsAmo ) -> AtomicStall.id.U ,
(headIsSt ) -> StoreStall.id.U ,
(headIsLd ) -> ldReason ,
(headIsDiv ) -> DivStall.id.U ,
(headIsInt ) -> IntNotReadyStall.id.U ,
(headIsFp ) -> FPNotReadyStall.id.U ,
))
}
TopDownCounters.values.foreach(ctr => XSPerfAccumulate(ctr.toString(), PopCount(stallReason.map(_ === ctr.id.U))))
XSPerfHistogram("slots_fire", PopCount(thisActualOut), true.B, 0, RenameWidth+1, 1)
// Explaination: when out(0) not fire, PopCount(valid) is not meaningfull
XSPerfHistogram("slots_valid_pure", PopCount(io.enqRob.req.map(_.valid)), thisActualOut(0), 0, RenameWidth+1, 1)
XSPerfHistogram("slots_valid_rough", PopCount(io.enqRob.req.map(_.valid)), true.B, 0, RenameWidth+1, 1)
val perfEvents = Seq(
("dispatch_in", PopCount(io.fromRename.map(_.valid & io.fromRename(0).ready)) ),
("dispatch_empty", !hasValidInstr ),
("dispatch_utili", PopCount(io.fromRename.map(_.valid)) ),
("dispatch_waitinstr", PopCount((0 until RenameWidth).map(i => io.fromRename(i).valid && !io.recv(i))) ),
("dispatch_stall_cycle_lsq", false.B ),
("dispatch_stall_cycle_rob", hasValidInstr && !io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept),
("dispatch_stall_cycle_int_dq", hasValidInstr && io.enqRob.canAccept && !io.toIntDq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept),
("dispatch_stall_cycle_fp_dq", hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && !io.toFpDq.canAccept && io.toLsDq.canAccept),
("dispatch_stall_cycle_ls_dq", hasValidInstr && io.enqRob.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept && !io.toLsDq.canAccept)
("dispatch_in", PopCount(io.fromRename.map(_.valid & io.fromRename(0).ready)) ),
("dispatch_empty", !hasValidInstr ),
("dispatch_utili", PopCount(io.fromRename.map(_.valid)) ),
("dispatch_waitinstr", PopCount((0 until RenameWidth).map(i => io.fromRename(i).valid && !io.recv(i)))),
("dispatch_stall_cycle_lsq", false.B ),
("dispatch_stall_cycle_rob", stall_rob ),
("dispatch_stall_cycle_int_dq", stall_int_dq ),
("dispatch_stall_cycle_fp_dq", stall_fp_dq ),
("dispatch_stall_cycle_ls_dq", stall_ls_dq )
)
generatePerfEvent()
}
......@@ -98,10 +98,8 @@ class ReservationStationWrapper(implicit p: Parameters) extends LazyModule with
}
if (cfg == StaExeUnitCfg || cfg == LdExeUnitCfg) {
params.lsqFeedback = true
params.checkWaitBit = true
}
if(cfg == StaExeUnitCfg) {
params.hasFeedback = true
params.checkWaitBit = false
}
if (cfg.hasCertainLatency) {
params.fixedLatency = if (cfg == MulDivExeUnitCfg) mulCfg.latency.latencyVal.get else cfg.latency.latencyVal.get
......@@ -939,21 +937,6 @@ class ReservationStation(params: RSParams)(implicit p: Parameters) extends XSMod
}
}
if (env.EnableTopDown && params.isLoad) {
val l1d_loads_bound = WireDefault(0.B)
ExcitingUtils.addSink(l1d_loads_bound, "l1d_loads_bound", ExcitingUtils.Perf)
val mshrFull = statusArray.io.rsFeedback(RSFeedbackType.mshrFull.litValue.toInt)
val tlbMiss = !mshrFull && statusArray.io.rsFeedback(RSFeedbackType.tlbMiss.litValue.toInt)
val dataInvalid = !mshrFull && !tlbMiss && statusArray.io.rsFeedback(RSFeedbackType.dataInvalid.litValue.toInt)
val bankConflict = !mshrFull && !tlbMiss && !dataInvalid && statusArray.io.rsFeedback(RSFeedbackType.bankConflict.litValue.toInt)
val ldVioCheckRedo = !mshrFull && !tlbMiss && !dataInvalid && !bankConflict && statusArray.io.rsFeedback(RSFeedbackType.ldVioCheckRedo.litValue.toInt)
XSPerfAccumulate("l1d_loads_mshr_bound", l1d_loads_bound && mshrFull)
XSPerfAccumulate("l1d_loads_tlb_bound", l1d_loads_bound && tlbMiss)
XSPerfAccumulate("l1d_loads_store_data_bound", l1d_loads_bound && dataInvalid)
XSPerfAccumulate("l1d_loads_bank_conflict_bound", l1d_loads_bound && bankConflict)
XSPerfAccumulate("l1d_loads_vio_check_redo_bound", l1d_loads_bound && ldVioCheckRedo)
}
XSPerfAccumulate("redirect_num", io.redirect.valid)
XSPerfAccumulate("allocate_num", PopCount(s0_doEnqueue))
XSPerfHistogram("issue_num", PopCount(io.deq.map(_.valid)), true.B, 0, params.numDeq, 1)
......
......@@ -99,7 +99,7 @@ class StatusArray(params: RSParams)(implicit p: Parameters) extends XSModule
val stIssuePtr = if (params.checkWaitBit) Input(new SqPtr()) else null
val memWaitUpdateReq = if (params.checkWaitBit) Flipped(new MemWaitUpdateReq) else null
val rsFeedback = Output(Vec(5, Bool()))
val rsFeedback = Output(Vec(RSFeedbackType.allTypes, Bool()))
})
val statusArrayValid = RegInit(VecInit(Seq.fill(params.numEntries)(false.B)))
......@@ -112,7 +112,7 @@ class StatusArray(params: RSParams)(implicit p: Parameters) extends XSModule
val replayArrayNext = WireInit(replayArray)
replayArray := replayArrayNext
(statusArrayValid zip replayArrayNext).foreach { case (valid, replay) => when(valid === 0.B) { replay := RSFeedbackType.feedbackInvalid } }
io.rsFeedback := VecInit((0 until 5).map(index => statusArrayValid.zip(replayArray).map {
io.rsFeedback := VecInit((0 until RSFeedbackType.allTypes).map(index => statusArrayValid.zip(replayArray).map {
case (valid, replay) => valid && replay === index.U
}.reduce(_ || _)))
......
......@@ -48,6 +48,11 @@ class Rename(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHe
// debug arch ports
val debug_int_rat = Vec(32, Input(UInt(PhyRegIdxWidth.W)))
val debug_fp_rat = Vec(32, Input(UInt(PhyRegIdxWidth.W)))
// perf only
val stallReason = new Bundle {
val in = Flipped(new StallReasonIO(RenameWidth))
val out = new StallReasonIO(RenameWidth)
}
})
// create free list and rat
......@@ -352,6 +357,39 @@ class Rename(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHe
printRenameInfo(x, y)
}
val debugRedirect = RegEnable(io.redirect.bits, io.redirect.valid)
// bad speculation
val recStall = io.redirect.valid || io.robCommits.isWalk
val ctrlRecStall = Mux(io.redirect.valid, io.redirect.bits.debugIsCtrl, io.robCommits.isWalk && debugRedirect.debugIsCtrl)
val mvioRecStall = Mux(io.redirect.valid, io.redirect.bits.debugIsMemVio, io.robCommits.isWalk && debugRedirect.debugIsMemVio)
val otherRecStall = recStall && !(ctrlRecStall || mvioRecStall)
XSPerfAccumulate("recovery_stall", recStall)
XSPerfAccumulate("control_recovery_stall", ctrlRecStall)
XSPerfAccumulate("mem_violation_recovery_stall", mvioRecStall)
XSPerfAccumulate("other_recovery_stall", otherRecStall)
// freelist stall
val notRecStall = !io.out.head.valid && !recStall
val intFlStall = notRecStall && hasValid && !intFreeList.io.canAllocate
val fpFlStall = notRecStall && hasValid && !fpFreeList.io.canAllocate
// other stall
val otherStall = notRecStall && !intFlStall && !fpFlStall
io.stallReason.in.backReason.valid := io.stallReason.out.backReason.valid || !io.in.head.ready
io.stallReason.in.backReason.bits := Mux(io.stallReason.out.backReason.valid, io.stallReason.out.backReason.bits,
MuxCase(TopDownCounters.OtherCoreStall.id.U, Seq(
ctrlRecStall -> TopDownCounters.ControlRecoveryStall.id.U,
mvioRecStall -> TopDownCounters.MemVioRecoveryStall.id.U,
otherRecStall -> TopDownCounters.OtherRecoveryStall.id.U,
intFlStall -> TopDownCounters.IntFlStall.id.U,
fpFlStall -> TopDownCounters.FpFlStall.id.U
)
))
io.stallReason.out.reason.zip(io.stallReason.in.reason).zip(io.in.map(_.valid)).foreach { case ((out, in), valid) =>
out := Mux(io.stallReason.in.backReason.valid,
io.stallReason.in.backReason.bits,
Mux(valid, TopDownCounters.NoStall.id.U, in))
}
XSDebug(io.robCommits.isWalk, p"Walk Recovery Enabled\n")
XSDebug(io.robCommits.isWalk, p"validVec:${Binary(io.robCommits.walkValid.asUInt)}\n")
for (i <- 0 until CommitWidth) {
......@@ -370,13 +408,17 @@ class Rename(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHe
XSPerfAccumulate("stall_cycle_fp", hasValid && io.out(0).ready && !fpFreeList.io.canAllocate && intFreeList.io.canAllocate && !io.robCommits.isWalk)
XSPerfAccumulate("stall_cycle_int", hasValid && io.out(0).ready && fpFreeList.io.canAllocate && !intFreeList.io.canAllocate && !io.robCommits.isWalk)
XSPerfAccumulate("stall_cycle_walk", hasValid && io.out(0).ready && fpFreeList.io.canAllocate && intFreeList.io.canAllocate && io.robCommits.isWalk)
XSPerfAccumulate("recovery_bubbles", PopCount(io.in.map(_.valid && io.out(0).ready && fpFreeList.io.canAllocate && intFreeList.io.canAllocate && io.robCommits.isWalk)))
XSPerfHistogram("slots_fire", PopCount(io.out.map(_.fire)), true.B, 0, RenameWidth+1, 1)
// Explaination: when out(0) not fire, PopCount(valid) is not meaningfull
XSPerfHistogram("slots_valid_pure", PopCount(io.in.map(_.valid)), io.out(0).fire, 0, RenameWidth+1, 1)
XSPerfHistogram("slots_valid_rough", PopCount(io.in.map(_.valid)), true.B, 0, RenameWidth+1, 1)
XSPerfAccumulate("move_instr_count", PopCount(io.out.map(out => out.fire && out.bits.ctrl.isMove)))
val is_fused_lui_load = io.out.map(o => o.fire && o.bits.ctrl.fuType === FuType.ldu && o.bits.ctrl.srcType(0) === SrcType.imm)
XSPerfAccumulate("fused_lui_load_instr_count", PopCount(is_fused_lui_load))
val renamePerf = Seq(
("rename_in ", PopCount(io.in.map(_.valid & io.in(0).ready )) ),
("rename_waitinstr ", PopCount((0 until RenameWidth).map(i => io.in(i).valid && !io.in(i).ready)) ),
......
......@@ -26,14 +26,15 @@ import utility._
import xiangshan._
import xiangshan.backend.exu.ExuConfig
import xiangshan.frontend.FtqPtr
import xiangshan.mem.{LsqEnqIO, LqPtr}
// Memory-dependence-predictor (MDP) debug information attached to a memory op.
class DebugMdpInfo(implicit p: Parameters) extends XSBundle{
// store-set id -- presumably assigned by the store-set predictor; TODO confirm
val ssid = UInt(SSIDWidth.W)
// NOTE(review): looks like "wait for all older stores" predictor decision -- verify
val waitAllStore = Bool()
}
class DebugLsInfo(implicit p: Parameters) extends XSBundle{
val s1 = new Bundle{
class DebugLsInfo(implicit p: Parameters) extends XSBundle {
val s1 = new Bundle {
val isTlbFirstMiss = Bool() // in s1
val isBankConflict = Bool() // in s1
val isLoadToLoadForward = Bool()
......@@ -70,7 +71,7 @@ class DebugLsInfo(implicit p: Parameters) extends XSBundle{
}
}
object DebugLsInfo{
object DebugLsInfo {
def init(implicit p: Parameters): DebugLsInfo = {
val lsInfo = Wire(new DebugLsInfo)
lsInfo.s1.isTlbFirstMiss := false.B
......@@ -96,22 +97,35 @@ class DebugLSIO(implicit p: Parameters) extends XSBundle {
val debugLsInfo = Vec(exuParameters.LduCnt + exuParameters.StuCnt, Output(new DebugLsInfoBundle))
}
class DebugInstDB(implicit p: Parameters) extends XSBundle{
val globalID = UInt(XLEN.W)
val robIdx = UInt(log2Ceil(RobSize).W)
val instType = FuType()
val exceptType = ExceptionVec()
val ivaddr = UInt(VAddrBits.W)
val dvaddr = UInt(VAddrBits.W) // the l/s access address
val dpaddr = UInt(VAddrBits.W) // need the physical address when the TLB is valid
val tlbLatency = UInt(XLEN.W) // original requirements is L1toL2TlbLatency
// val levelTlbHit = UInt(2.W) // 01, 10, 11(memory)
// val otherPerfNoteThing // FIXME: how much?
val accessLatency = UInt(XLEN.W) // RS out time --> write back time
val executeLatency = UInt(XLEN.W)
val issueLatency = UInt(XLEN.W)
val lsInfo = new DebugLsInfo
val mdpInfo = new DebugMdpInfo
// Load top-down profiling info kept per ROB entry, split by load-pipeline stage:
// s1 captures the virtual address, s2 the physical address (presumably after address
// translation — TODO confirm in the load pipeline). Entries are updated sticky-style
// through the s*SignalEnable methods, reset via LsTopdownInfo.init at ROB enqueue, and
// read out at the ROB head (see sourceVaddr/sourcePaddr later in this file).
class LsTopdownInfo(implicit p: Parameters) extends XSBundle {
val s1 = new Bundle {
// ROB index of the load observed in s1; used to select which entry to update.
val robIdx = UInt(log2Ceil(RobSize).W)
// Virtual address of the access, valid once captured in s1.
val vaddr_valid = Bool()
val vaddr_bits = UInt(VAddrBits.W)
}
val s2 = new Bundle {
// ROB index of the load observed in s2.
val robIdx = UInt(log2Ceil(RobSize).W)
// Physical address of the access, valid once captured in s2.
val paddr_valid = Bool()
val paddr_bits = UInt(PAddrBits.W)
}
// Merge incoming s1 info into this entry: once `ena` reports a valid vaddr, latch the
// bits and assert the valid flag. The flag is never cleared here — it stays set until
// the entry is re-initialized at enqueue.
def s1SignalEnable(ena: LsTopdownInfo) = {
when(ena.s1.vaddr_valid) {
s1.vaddr_valid := true.B
s1.vaddr_bits := ena.s1.vaddr_bits
}
}
// Same sticky-latch update as s1SignalEnable, but for the s2 physical address.
def s2SignalEnable(ena: LsTopdownInfo) = {
when(ena.s2.paddr_valid) {
s2.paddr_valid := true.B
s2.paddr_bits := ena.s2.paddr_bits
}
}
}
// Companion object: all-zero initial value (both valid bits deasserted), used to reset
// a ROB entry's top-down info at enqueue time.
object LsTopdownInfo {
def init(implicit p: Parameters): LsTopdownInfo = 0.U.asTypeOf(new LsTopdownInfo)
}
class RobPtr(implicit p: Parameters) extends CircularQueuePtr[RobPtr](
......@@ -154,6 +168,10 @@ class RobLsqIO(implicit p: Parameters) extends XSBundle {
val pendingld = Output(Bool())
val pendingst = Output(Bool())
val commit = Output(Bool())
val pendingPtr = Output(new RobPtr)
val mmio = Input(Vec(LoadPipelineWidth, Bool()))
val uop = Input(Vec(LoadPipelineWidth, new MicroOp))
}
class RobEnqIO(implicit p: Parameters) extends XSBundle {
......@@ -165,8 +183,6 @@ class RobEnqIO(implicit p: Parameters) extends XSBundle {
val resp = Vec(RenameWidth, Output(new RobPtr))
}
class RobDispatchData(implicit p: Parameters) extends RobCommitInfo
class RobDeqPtrWrapper(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper {
val io = IO(new Bundle {
// for commits/flush
......@@ -395,9 +411,14 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
val robDeqPtr = Output(new RobPtr)
val csr = new RobCSRIO
val robFull = Output(Bool())
val headNotReady = Output(Bool())
val cpu_halt = Output(Bool())
val wfi_enable = Input(Bool())
val debug_ls = Flipped(new DebugLSIO)
val debugRobHead = Output(new MicroOp)
val debugEnqLsq = Input(new LsqEnqIO)
val debugHeadLsIssue = Input(Bool())
val lsTopdownInfo = Vec(exuParameters.LduCnt, Input(new LsTopdownInfo))
})
def selectWb(index: Int, func: Seq[ExuConfig] => Boolean): Seq[(Seq[ExuConfig], ValidIO[ExuOutput])] = {
......@@ -424,6 +445,7 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
// writeback status
val writebacked = Mem(RobSize, Bool())
val store_data_writebacked = Mem(RobSize, Bool())
val mmio = RegInit(VecInit(Seq.fill(RobSize)(false.B)))
// data for redirect, exception, etc.
val flagBkup = Mem(RobSize, Bool())
// some instructions are not allowed to trigger interrupts
......@@ -436,6 +458,9 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
val debug_exuData = Reg(Vec(RobSize, UInt(XLEN.W)))//for debug
val debug_exuDebug = Reg(Vec(RobSize, new DebugBundle))//for debug
val debug_lsInfo = RegInit(VecInit(Seq.fill(RobSize)(DebugLsInfo.init)))
val debug_lsTopdownInfo = RegInit(VecInit(Seq.fill(RobSize)(LsTopdownInfo.init)))
val debug_lqIdxValid = RegInit(VecInit.fill(RobSize)(false.B))
val debug_lsIssued = RegInit(VecInit.fill(RobSize)(false.B))
// pointers
// For enqueue ptr, we don't duplicate it since only enqueue needs it.
......@@ -452,6 +477,9 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
val isEmpty = enqPtr === deqPtr
val isReplaying = io.redirect.valid && RedirectLevel.flushItself(io.redirect.bits.level)
val debug_lsIssue = WireDefault(debug_lsIssued)
debug_lsIssue(deqPtr.value) := io.debugHeadLsIssue
/**
* states of Rob
*/
......@@ -469,7 +497,7 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
* (1) read: commits/walk/exception
* (2) write: write back from exe units
*/
val dispatchData = Module(new SyncDataModuleTemplate(new RobDispatchData, RobSize, CommitWidth, RenameWidth))
val dispatchData = Module(new SyncDataModuleTemplate(new RobCommitInfo, RobSize, CommitWidth, RenameWidth))
val dispatchDataRead = dispatchData.io.rdata
val exceptionGen = Module(new ExceptionGen)
......@@ -477,6 +505,7 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
val fflagsDataRead = Wire(Vec(CommitWidth, UInt(5.W)))
io.robDeqPtr := deqPtr
io.debugRobHead := debug_microOp(deqPtr.value)
/**
* Enqueue (from dispatch)
......@@ -528,6 +557,9 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
debug_microOp(enqIndex).debugInfo.tlbFirstReqTime := timer
debug_microOp(enqIndex).debugInfo.tlbRespTime := timer
debug_lsInfo(enqIndex) := DebugLsInfo.init
debug_lsTopdownInfo(enqIndex) := LsTopdownInfo.init
debug_lqIdxValid(enqIndex) := false.B
debug_lsIssued(enqIndex) := false.B
when (enqUop.ctrl.blockBackward) {
hasBlockBackward := true.B
}
......@@ -552,6 +584,8 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
when (enqUop.ctrl.isWFI && !enqHasException && !enqHasTriggerHit) {
hasWFI := true.B
}
mmio(enqIndex) := false.B
}
}
val dispatchNum = Mux(io.enq.canAccept, PopCount(io.enq.req.map(_.valid)), 0.U)
......@@ -561,6 +595,19 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
hasWFI := false.B
}
// lqEnq
io.debugEnqLsq.needAlloc.map(_(0)).zip(io.debugEnqLsq.req).foreach { case (alloc, req) =>
when(io.debugEnqLsq.canAccept && alloc && req.valid) {
debug_microOp(req.bits.robIdx.value).lqIdx := req.bits.lqIdx
debug_lqIdxValid(req.bits.robIdx.value) := true.B
}
}
// lsIssue
when(io.debugHeadLsIssue) {
debug_lsIssued(deqPtr.value) := true.B
}
/**
* Writeback (from execution units)
*/
......@@ -591,6 +638,11 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
val writebackNum = PopCount(exuWriteback.map(_.valid))
XSInfo(writebackNum =/= 0.U, "writebacked %d insts\n", writebackNum)
for (i <- 0 until LoadPipelineWidth) {
when (RegNext(io.lsq.mmio(i))) {
mmio(RegNext(io.lsq.uop(i).robIdx).value) := true.B
}
}
/**
* RedirectOut: Interrupt and Exceptions
......@@ -736,9 +788,10 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
io.lsq.lcommit := RegNext(Mux(io.commits.isCommit, PopCount(ldCommitVec), 0.U))
io.lsq.scommit := RegNext(Mux(io.commits.isCommit, PopCount(stCommitVec), 0.U))
// indicate a pending load or store
io.lsq.pendingld := RegNext(io.commits.isCommit && io.commits.info(0).commitType === CommitType.LOAD && valid(deqPtr.value))
io.lsq.pendingld := RegNext(io.commits.isCommit && io.commits.info(0).commitType === CommitType.LOAD && valid(deqPtr.value) && mmio(deqPtr.value))
io.lsq.pendingst := RegNext(io.commits.isCommit && io.commits.info(0).commitType === CommitType.STORE && valid(deqPtr.value))
io.lsq.commit := RegNext(io.commits.isCommit && io.commits.commitValid(0))
io.lsq.pendingPtr := RegNext(deqPtr)
/**
* state changes
......@@ -867,6 +920,10 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
debug_lsInfo(io.debug_ls.debugLsInfo(i).s1_robIdx).s1SignalEnable(io.debug_ls.debugLsInfo(i))
debug_lsInfo(io.debug_ls.debugLsInfo(i).s2_robIdx).s2SignalEnable(io.debug_ls.debugLsInfo(i))
}
for (i <- 0 until exuParameters.LduCnt) {
debug_lsTopdownInfo(io.lsTopdownInfo(i).s1.robIdx).s1SignalEnable(io.lsTopdownInfo(i))
debug_lsTopdownInfo(io.lsTopdownInfo(i).s2.robIdx).s2SignalEnable(io.lsTopdownInfo(i))
}
// status field: writebacked
// enqueue logic set 6 writebacked to false
......@@ -1007,6 +1064,7 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
instrCntReg := instrCnt
io.csr.perfinfo.retiredInstr := retireCounter
io.robFull := !allowEnqueue
io.headNotReady := commit_v.head && !commit_w.head
/**
* debug info
......@@ -1097,18 +1155,36 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
}
}
val sourceVaddr = Wire(Valid(UInt(VAddrBits.W)))
sourceVaddr.valid := debug_lsTopdownInfo(deqPtr.value).s1.vaddr_valid
sourceVaddr.bits := debug_lsTopdownInfo(deqPtr.value).s1.vaddr_bits
val sourcePaddr = Wire(Valid(UInt(PAddrBits.W)))
sourcePaddr.valid := debug_lsTopdownInfo(deqPtr.value).s2.paddr_valid
sourcePaddr.bits := debug_lsTopdownInfo(deqPtr.value).s2.paddr_bits
val sourceLqIdx = Wire(Valid(new LqPtr))
sourceLqIdx.valid := debug_lqIdxValid(deqPtr.value)
sourceLqIdx.bits := debug_microOp(deqPtr.value).lqIdx
val sourceHeadLsIssue = WireDefault(debug_lsIssue(deqPtr.value))
ExcitingUtils.addSource(sourceVaddr, s"rob_head_vaddr_${coreParams.HartId}", ExcitingUtils.Perf, true)
ExcitingUtils.addSource(sourcePaddr, s"rob_head_paddr_${coreParams.HartId}", ExcitingUtils.Perf, true)
ExcitingUtils.addSource(sourceLqIdx, s"rob_head_lqIdx_${coreParams.HartId}", ExcitingUtils.Perf, true)
ExcitingUtils.addSource(sourceHeadLsIssue, s"rob_head_ls_issue_${coreParams.HartId}", ExcitingUtils.Perf, true)
// dummy sink
ExcitingUtils.addSink(WireDefault(sourceLqIdx), s"rob_head_lqIdx_${coreParams.HartId}", ExcitingUtils.Perf)
/**
* DataBase info:
* log trigger is at writeback valid
* */
if(!env.FPGAPlatform){
val instTableName = "InstDB" + p(XSCoreParamsKey).HartId.toString
val isWriteInstInfoTable = WireInit(Constantin.createRecord("isWriteInstInfoTable" + p(XSCoreParamsKey).HartId.toString))
val instTableName = "InstTable" + p(XSCoreParamsKey).HartId.toString
val instSiteName = "Rob" + p(XSCoreParamsKey).HartId.toString
val debug_instTable = ChiselDB.createTable(instTableName, new DebugInstDB)
val debug_instTable = ChiselDB.createTable(instTableName, new InstInfoEntry)
// FIXME lyq: only get inst (alu, bj, ls) in exuWriteback
for (wb <- exuWriteback) {
when(wb.valid) {
val debug_instData = Wire(new DebugInstDB)
val debug_instData = Wire(new InstInfoEntry)
val idx = wb.bits.uop.robIdx.value
debug_instData.globalID := wb.bits.uop.ctrl.debug_globalID
debug_instData.robIdx := idx
......@@ -1120,10 +1196,12 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
debug_instData.accessLatency := wb.bits.uop.debugInfo.writebackTime - wb.bits.uop.debugInfo.issueTime
debug_instData.executeLatency := wb.bits.uop.debugInfo.writebackTime - wb.bits.uop.debugInfo.issueTime
debug_instData.issueLatency := wb.bits.uop.debugInfo.issueTime - wb.bits.uop.debugInfo.selectTime
debug_instData.exceptType := wb.bits.uop.cf.exceptionVec
debug_instData.exceptType := Cat(wb.bits.uop.cf.exceptionVec)
debug_instData.lsInfo := debug_lsInfo(idx)
debug_instData.mdpInfo.ssid := wb.bits.uop.cf.ssid
debug_instData.mdpInfo.waitAllStore := wb.bits.uop.cf.loadWaitStrict && wb.bits.uop.cf.loadWaitBit
debug_instData.issueTime := wb.bits.uop.debugInfo.issueTime
debug_instData.writebackTime := wb.bits.uop.debugInfo.writebackTime
debug_instTable.log(
data = debug_instData,
en = wb.valid,
......
......@@ -27,7 +27,8 @@ import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp, Trans
import freechips.rocketchip.tilelink._
import freechips.rocketchip.util.{BundleFieldBase, UIntToOH1}
import device.RAMHelper
import huancun.{AliasField, AliasKey, DirtyField, PreferCacheField, PrefetchField}
import coupledL2.{AliasField, AliasKey, DirtyField, PrefetchField}
import utility.ReqSourceField
import utility.FastArbiter
import mem.{AddPipelineReg}
import xiangshan.cache.dcache.ReplayCarry
......@@ -50,7 +51,7 @@ case class DCacheParameters
nMMIOEntries: Int = 1,
nMMIOs: Int = 1,
blockBytes: Int = 64,
alwaysReleaseData: Boolean = true
alwaysReleaseData: Boolean = false
) extends L1CacheParameters {
// if sets * blockBytes > 4KB(page size),
// cache alias will happen,
......@@ -59,9 +60,9 @@ case class DCacheParameters
val aliasBitsOpt = if(setBytes > pageSize) Some(log2Ceil(setBytes / pageSize)) else None
val reqFields: Seq[BundleFieldBase] = Seq(
PrefetchField(),
PreferCacheField()
ReqSourceField()
) ++ aliasBitsOpt.map(AliasField)
val echoFields: Seq[BundleFieldBase] = Seq(DirtyField())
val echoFields: Seq[BundleFieldBase] = Nil
def tagCode: Code = Code.fromString(tagECC)
......@@ -152,7 +153,7 @@ trait HasDCacheParameters extends HasL1CacheParameters {
val DCacheLineOffset = DCacheSetOffset
// uncache
val uncacheIdxBits = log2Up(StoreQueueSize) max log2Up(LoadQueueSize)
val uncacheIdxBits = log2Up(StoreQueueSize + 1) max log2Up(VirtualLoadQueueSize + 1)
// hardware prefetch parameters
// high confidence hardware prefetch port
val HighConfHWPFLoadPort = LoadPipelineWidth - 1 // use the last load port by default
......@@ -299,18 +300,21 @@ class DCacheExtraMeta(implicit p: Parameters) extends DCacheBundle
}
// memory request in word granularity(load, mmio, lr/sc, atomics)
class DCacheWordReq(implicit p: Parameters) extends DCacheBundle
class DCacheWordReq(implicit p: Parameters) extends DCacheBundle
{
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val vaddr = UInt(VAddrBits.W)
val data = UInt(DataBits.W)
val mask = UInt((DataBits/8).W)
val id = UInt(reqIdWidth.W)
val instrtype = UInt(sourceTypeWidth.W)
val isFirstIssue = Bool()
val replayCarry = new ReplayCarry
val debug_robIdx = UInt(log2Ceil(RobSize).W)
def dump() = {
XSDebug("DCacheWordReq: cmd: %x addr: %x data: %x mask: %x id: %d\n",
cmd, addr, data, mask, id)
XSDebug("DCacheWordReq: cmd: %x vaddr: %x data: %x mask: %x id: %d\n",
cmd, vaddr, data, mask, id)
}
}
......@@ -331,7 +335,7 @@ class DCacheLineReq(implicit p: Parameters) extends DCacheBundle
}
class DCacheWordReqWithVaddr(implicit p: Parameters) extends DCacheWordReq {
val vaddr = UInt(VAddrBits.W)
val addr = UInt(PAddrBits.W)
val wline = Bool()
}
......@@ -342,7 +346,6 @@ class BaseDCacheWordResp(implicit p: Parameters) extends DCacheBundle
// select in s3
val data_delayed = UInt(DataBits.W)
val id = UInt(reqIdWidth.W)
// cache req missed, send it to miss queue
val miss = Bool()
// cache miss, and failed to enter the missqueue, replay from RS is needed
......@@ -352,6 +355,7 @@ class BaseDCacheWordResp(implicit p: Parameters) extends DCacheBundle
val tag_error = Bool() // tag error
val mshr_id = UInt(log2Up(cfg.nMissEntries).W)
val debug_robIdx = UInt(log2Ceil(RobSize).W)
def dump() = {
XSDebug("DCacheWordResp: data: %x id: %d miss: %b replay: %b\n",
data, id, miss, replay)
......@@ -362,8 +366,11 @@ class DCacheWordResp(implicit p: Parameters) extends BaseDCacheWordResp
{
val meta_prefetch = Bool()
val meta_access = Bool()
// 1 cycle after data resp
// s2
val handled = Bool()
// s3: 1 cycle after data resp
val error_delayed = Bool() // all kinds of errors, include tag error
val replacementUpdated = Bool()
}
class BankedDCacheWordResp(implicit p: Parameters) extends DCacheWordResp
......@@ -430,6 +437,7 @@ class UncacheWordReq(implicit p: Parameters) extends DCacheBundle
val id = UInt(uncacheIdxBits.W)
val instrtype = UInt(sourceTypeWidth.W)
val atomic = Bool()
val isFirstIssue = Bool()
val replayCarry = new ReplayCarry
def dump() = {
......@@ -450,6 +458,7 @@ class UncacheWorResp(implicit p: Parameters) extends DCacheBundle
val replayCarry = new ReplayCarry
val mshr_id = UInt(log2Up(cfg.nMissEntries).W) // FIXME: why uncacheWordResp is not merged to baseDcacheResp
val debug_robIdx = UInt(log2Ceil(RobSize).W)
def dump() = {
XSDebug("UncacheWordResp: data: %x id: %d miss: %b replay: %b, tag_error: %b, error: %b\n",
data, id, miss, replay, tag_error, error)
......@@ -488,14 +497,17 @@ class DCacheLoadIO(implicit p: Parameters) extends DCacheWordIO
val s1_kill = Output(Bool())
val s2_kill = Output(Bool())
val s2_pc = Output(UInt(VAddrBits.W))
// cycle 0: load has updated replacement before
val replacementUpdated = Output(Bool())
// cycle 0: virtual address: req.addr
// cycle 1: physical address: s1_paddr
val s1_paddr_dup_lsu = Output(UInt(PAddrBits.W)) // lsu side paddr
val s1_paddr_dup_dcache = Output(UInt(PAddrBits.W)) // dcache side paddr
val s1_disable_fast_wakeup = Input(Bool())
val s1_bank_conflict = Input(Bool())
// cycle 2: hit signal
val s2_hit = Input(Bool()) // hit signal for lsu,
val s2_hit = Input(Bool()) // hit signal for lsu,
val s2_first_hit = Input(Bool())
val s2_bank_conflict = Input(Bool())
// debug
val debug_s1_hit_way = Input(UInt(nWays.W))
......@@ -691,7 +703,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
//----------------------------------------
// core data structures
val bankedDataArray = Module(new BankedDataArray)
val bankedDataArray = if(EnableDCacheWPU) Module(new SramedDataArray) else Module(new BankedDataArray)
val metaArray = Module(new L1CohMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2))
val errorArray = Module(new L1FlagMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2))
val prefetchArray = Module(new L1FlagMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2)) // prefetch flag array
......@@ -791,6 +803,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
//----------------------------------------
// data array
mainPipe.io.data_read.zip(ldu).map(x => x._1 := x._2.io.lsu.req.valid)
val dataWriteArb = Module(new Arbiter(new L1BankedDataWriteReq, 2))
dataWriteArb.io.in(0) <> refillPipe.io.data_write
......@@ -808,7 +821,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
bankedDataArray.io.write_dup(bank) <> dataWriteArb_dup.io.out
}
bankedDataArray.io.readline <> mainPipe.io.data_read
bankedDataArray.io.readline <> mainPipe.io.data_readline
bankedDataArray.io.readline_intend := mainPipe.io.data_read_intend
mainPipe.io.readline_error_delayed := bankedDataArray.io.readline_error_delayed
mainPipe.io.data_resp := bankedDataArray.io.readline_resp
......@@ -847,6 +860,35 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
bankedDataArray.io.disable_ld_fast_wakeup(w) // load pipe fast wake up should be disabled when bank conflict
}
/** LoadMissDB: record load miss state */
val isWriteLoadMissTable = WireInit(Constantin.createRecord("isWriteLoadMissTable" + p(XSCoreParamsKey).HartId.toString))
val isFirstHitWrite = WireInit(Constantin.createRecord("isFirstHitWrite" + p(XSCoreParamsKey).HartId.toString))
val tableName = "LoadMissDB" + p(XSCoreParamsKey).HartId.toString
val siteName = "DcacheWrapper" + p(XSCoreParamsKey).HartId.toString
val loadMissTable = ChiselDB.createTable(tableName, new LoadMissEntry)
for( i <- 0 until LoadPipelineWidth){
val loadMissEntry = Wire(new LoadMissEntry)
val loadMissWriteEn =
(!ldu(i).io.lsu.resp.bits.replay && ldu(i).io.miss_req.fire) ||
(ldu(i).io.lsu.s2_first_hit && ldu(i).io.lsu.resp.valid && isFirstHitWrite.orR)
loadMissEntry.timeCnt := GTimer()
loadMissEntry.robIdx := ldu(i).io.lsu.resp.bits.debug_robIdx
loadMissEntry.paddr := ldu(i).io.miss_req.bits.addr
loadMissEntry.vaddr := ldu(i).io.miss_req.bits.vaddr
loadMissEntry.missState := OHToUInt(Cat(Seq(
ldu(i).io.miss_req.fire & ldu(i).io.miss_resp.merged,
ldu(i).io.miss_req.fire & !ldu(i).io.miss_resp.merged,
ldu(i).io.lsu.s2_first_hit && ldu(i).io.lsu.resp.valid
)))
loadMissTable.log(
data = loadMissEntry,
en = isWriteLoadMissTable.orR && loadMissWriteEn,
site = siteName,
clock = clock,
reset = reset
)
}
//----------------------------------------
// atomics
// atomics not finished yet
......@@ -1104,7 +1146,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
ld_access.zip(ldu).foreach {
case (a, u) =>
a.valid := RegNext(u.io.lsu.req.fire()) && !u.io.lsu.s1_kill
a.bits.idx := RegNext(get_idx(u.io.lsu.req.bits.addr))
a.bits.idx := RegNext(get_idx(u.io.lsu.req.bits.vaddr))
a.bits.tag := get_tag(u.io.lsu.s1_paddr_dup_dcache)
}
st_access.valid := RegNext(mainPipe.io.store_req.fire())
......
......@@ -21,6 +21,7 @@ import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink.ClientMetadata
import utils.{HasPerfEvents, XSDebug, XSPerfAccumulate}
import utility.ParallelPriorityMux
import xiangshan.L1CacheErrorInfo
import xiangshan.cache.dcache.{DCacheWPU, IdealWPU}
......@@ -87,11 +88,11 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val tag_read = io.tag_read.bits
// Tag read for new requests
meta_read.idx := get_idx(io.lsu.req.bits.addr)
meta_read.idx := get_idx(io.lsu.req.bits.vaddr)
meta_read.way_en := ~0.U(nWays.W)
// meta_read.tag := DontCare
tag_read.idx := get_idx(io.lsu.req.bits.addr)
tag_read.idx := get_idx(io.lsu.req.bits.vaddr)
tag_read.way_en := ~0.U(nWays.W)
// Pipeline
......@@ -103,7 +104,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val s0_valid = io.lsu.req.fire()
val s0_req = io.lsu.req.bits
val s0_fire = s0_valid && s1_ready
val s0_vaddr = s0_req.addr
val s0_vaddr = s0_req.vaddr
val s0_replayCarry = s0_req.replayCarry
assert(RegNext(!(s0_valid && (s0_req.cmd =/= MemoryOpConstants.M_XRD && s0_req.cmd =/= MemoryOpConstants.M_PFR && s0_req.cmd =/= MemoryOpConstants.M_PFW))), "LoadPipe only accepts load req / softprefetch read or write!")
dump_pipeline_reqs("LoadPipe s0", s0_valid, s0_req)
......@@ -119,7 +120,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val s1_paddr_dup_lsu = io.lsu.s1_paddr_dup_lsu
val s1_paddr_dup_dcache = io.lsu.s1_paddr_dup_dcache
// LSU may update the address from io.lsu.s1_paddr, which affects the bank read enable only.
val s1_vaddr = Cat(s1_req.addr(PAddrBits - 1, blockOffBits), io.lsu.s1_paddr_dup_lsu(blockOffBits - 1, 0))
val s1_vaddr = Cat(s1_req.vaddr(VAddrBits - 1, blockOffBits), io.lsu.s1_paddr_dup_lsu(blockOffBits - 1, 0))
val s1_bank_oh = UIntToOH(addr_to_dcache_bank(s1_vaddr))
val s1_nack = RegNext(io.nack)
val s1_nack_data = !io.banked_data_read.ready
......@@ -195,16 +196,24 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
io.replace_way.set.valid := RegNext(s0_fire)
io.replace_way.set.bits := get_idx(s1_vaddr)
val s1_repl_way_en = UIntToOH(io.replace_way.way)
val s1_repl_tag = Mux1H(s1_repl_way_en, wayMap(w => tag_resp(w)))
val s1_repl_coh = Mux1H(s1_repl_way_en, wayMap(w => meta_resp(w).coh))
val s1_repl_extra_meta = Mux1H(s1_repl_way_en, wayMap(w => io.extra_meta_resp(w)))
val s1_invalid_vec = wayMap(w => !meta_resp(w).coh.isValid())
val s1_have_invalid_way = s1_invalid_vec.asUInt.orR
val s1_invalid_way_en = ParallelPriorityMux(s1_invalid_vec.zipWithIndex.map(x => x._1 -> UIntToOH(x._2.U(nWays.W))))
val s1_repl_way_en_oh = Mux(s1_have_invalid_way, s1_invalid_way_en, UIntToOH(io.replace_way.way))
val s1_repl_way_en_enc = OHToUInt(s1_repl_way_en_oh)
val s1_repl_tag = Mux1H(s1_repl_way_en_oh, wayMap(w => tag_resp(w)))
val s1_repl_coh = Mux1H(s1_repl_way_en_oh, wayMap(w => meta_resp(w).coh))
val s1_repl_extra_meta = Mux1H(s1_repl_way_en_oh, wayMap(w => io.extra_meta_resp(w)))
val s1_need_replacement = !s1_tag_match_dup_dc
val s1_way_en = Mux(s1_need_replacement, s1_repl_way_en, s1_tag_match_way_dup_dc)
val s1_way_en = Mux(s1_need_replacement, s1_repl_way_en_oh, s1_tag_match_way_dup_dc)
val s1_coh = Mux(s1_need_replacement, s1_repl_coh, s1_hit_coh)
val s1_tag = Mux(s1_need_replacement, s1_repl_tag, get_tag(s1_paddr_dup_dcache))
XSPerfAccumulate("load_has_invalid_way_but_select_valid_way", io.replace_way.set.valid && wayMap(w => !meta_resp(w).coh.isValid()).asUInt.orR && s1_need_replacement && s1_repl_coh.isValid())
XSPerfAccumulate("load_using_replacement", io.replace_way.set.valid && s1_need_replacement)
// data read
io.banked_data_read.valid := s1_fire && !s1_nack
io.banked_data_read.bits.addr := s1_vaddr
......@@ -272,7 +281,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val s2_nack_data = RegEnable(!io.banked_data_read.ready, s1_fire)
val s2_nack = s2_nack_hit || s2_nack_no_mshr || s2_nack_data
// s2 miss merged
val s2_miss_merged = io.miss_req.valid && io.miss_resp.merged
val s2_miss_merged = io.miss_req.fire && !io.miss_req.bits.cancel && io.miss_resp.merged
val s2_bank_addr = addr_to_dcache_bank(s2_paddr)
dontTouch(s2_bank_addr)
......@@ -329,6 +338,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
}
// io.debug_s2_cache_miss := real_miss
resp.bits.miss := real_miss || io.bank_conflict_slow || s2_wpu_pred_fail
io.lsu.s2_first_hit := s2_req.isFirstIssue && s2_hit
// load pipe need replay when there is a bank conflict or wpu predict fail
resp.bits.replay := (resp.bits.miss && (!io.miss_req.fire() || s2_nack)) || io.bank_conflict_slow || s2_wpu_pred_fail
resp.bits.replayCarry.valid := resp.bits.miss
......@@ -337,6 +347,8 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
resp.bits.meta_access := s2_hit_access
resp.bits.tag_error := s2_tag_error // report tag_error in load s2
resp.bits.mshr_id := io.miss_resp.id
resp.bits.handled := io.miss_req.fire && !io.miss_req.bits.cancel && io.miss_resp.handled
resp.bits.debug_robIdx := s2_req.debug_robIdx
XSPerfAccumulate("wpu_pred_fail", s2_wpu_pred_fail && s2_valid)
XSPerfAccumulate("dcache_read_bank_conflict", io.bank_conflict_slow && s2_valid)
......@@ -353,7 +365,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
io.lsu.debug_s1_hit_way := s1_tag_match_way_dup_dc
io.lsu.s1_disable_fast_wakeup := io.disable_ld_fast_wakeup
io.lsu.s1_bank_conflict := io.bank_conflict_fast
io.lsu.s2_bank_conflict := io.bank_conflict_slow
assert(RegNext(s1_ready && s2_ready), "load pipeline should never be blocked")
// --------------------------------------------------------------------------------
......@@ -376,6 +388,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
// error_delayed signal will be used to update uop.exception 1 cycle after load writeback
resp.bits.error_delayed := s3_error && (s3_hit || s3_tag_error) && s3_valid
resp.bits.data_delayed := s3_banked_data_resp_word
resp.bits.replacementUpdated := io.replace_access.valid
// report tag / data / l2 error (with paddr) to bus error unit
io.error := 0.U.asTypeOf(new L1CacheErrorInfo())
......@@ -389,30 +402,39 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
io.error.valid := s3_error && s3_valid
// update plru in s3
val s3_miss_merged = RegNext(s2_miss_merged)
val first_update = RegNext(RegNext(RegNext(!io.lsu.replacementUpdated)))
val hit_update_replace_en = RegNext(s2_valid) && RegNext(!resp.bits.miss)
val miss_update_replace_en = RegNext(io.miss_req.fire) && RegNext(!io.miss_req.bits.cancel) && RegNext(io.miss_resp.handled)
if (!cfg.updateReplaceOn2ndmiss) {
// replacement is only updated on 1st miss
io.replace_access.valid := RegNext(RegNext(
RegNext(io.meta_read.fire()) && s1_valid && !io.lsu.s1_kill) &&
!s2_nack_no_mshr &&
!s2_miss_merged
)
io.replace_access.bits.set := RegNext(RegNext(get_idx(s1_req.addr)))
io.replace_access.bits.way := RegNext(RegNext(Mux(s1_tag_match_dup_dc, OHToUInt(s1_tag_match_way_dup_dc), io.replace_way.way)))
// io.replace_access.valid := RegNext(RegNext(
// RegNext(io.meta_read.fire()) && s1_valid && !io.lsu.s1_kill) &&
// !s2_nack_no_mshr &&
// !s2_miss_merged
// )
io.replace_access.valid := (hit_update_replace_en || (miss_update_replace_en && !s3_miss_merged)) && first_update
io.replace_access.bits.set := RegNext(RegNext(get_idx(s1_req.vaddr)))
io.replace_access.bits.way := RegNext(RegNext(Mux(s1_tag_match_dup_dc, OHToUInt(s1_tag_match_way_dup_dc), s1_repl_way_en_enc)))
} else {
// replacement is updated on both 1st and 2nd miss
// timing is worse than !cfg.updateReplaceOn2ndmiss
io.replace_access.valid := RegNext(RegNext(
RegNext(io.meta_read.fire()) && s1_valid && !io.lsu.s1_kill) &&
!s2_nack_no_mshr
)
io.replace_access.bits.set := RegNext(RegNext(get_idx(s1_req.addr)))
// io.replace_access.valid := RegNext(RegNext(
// RegNext(io.meta_read.fire()) && s1_valid && !io.lsu.s1_kill) &&
// !s2_nack_no_mshr &&
// // replacement is updated on 2nd miss only when this req is firstly issued
// (!s2_miss_merged || s2_req.isFirstIssue)
// )
io.replace_access.valid := (hit_update_replace_en || miss_update_replace_en) && first_update
io.replace_access.bits.set := RegNext(RegNext(get_idx(s1_req.vaddr)))
io.replace_access.bits.way := RegNext(
Mux(
RegNext(s1_tag_match_dup_dc),
RegNext(OHToUInt(s1_tag_match_way_dup_dc)), // if hit, access hit way in plru
Mux( // if miss
!s2_miss_merged,
RegNext(io.replace_way.way), // 1st fire: access new selected replace way
RegNext(s1_repl_way_en_enc), // 1st fire: access new selected replace way
OHToUInt(io.miss_resp.repl_way_en) // 2nd fire: access replace way selected at miss queue allocate time
)
)
......
......@@ -25,7 +25,7 @@ import freechips.rocketchip.tilelink.TLPermissions._
import freechips.rocketchip.tilelink.{ClientMetadata, ClientStates, TLPermissions}
import utils._
import utility._
import xiangshan.L1CacheErrorInfo
import xiangshan.{L1CacheErrorInfo, XSCoreParamsKey}
class MainPipeReq(implicit p: Parameters) extends DCacheBundle {
val miss = Bool() // only amo miss will refill in main pipe
......@@ -121,8 +121,9 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
val probe_ttob_check_resp = Flipped(ValidIO(new ProbeToBCheckResp))
// data sram
val data_read = Vec(LoadPipelineWidth, Input(Bool()))
val data_read_intend = Output(Bool())
val data_read = DecoupledIO(new L1BankedDataReadLineReq)
val data_readline = DecoupledIO(new L1BankedDataReadLineReq)
val data_resp = Input(Vec(DCacheBanks, new L1BankedDataReadResult()))
val readline_error_delayed = Input(Bool())
val data_write = DecoupledIO(new L1BankedDataWriteReq)
......@@ -181,10 +182,22 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
val s1_ready, s2_ready, s3_ready = Wire(Bool())
// convert store req to main pipe req, and select a req from store and probe
val storeWaitCycles = RegInit(0.U(4.W))
val StoreWaitThreshold = WireInit(12.U(4.W))
val storeWaitTooLong = storeWaitCycles >= StoreWaitThreshold
val loadsAreComing = io.data_read.asUInt.orR
val storeCanAccept = storeWaitTooLong || !loadsAreComing
val store_req = Wire(DecoupledIO(new MainPipeReq))
store_req.bits := (new MainPipeReq).convertStoreReq(io.store_req.bits)
store_req.valid := io.store_req.valid
io.store_req.ready := store_req.ready
store_req.valid := io.store_req.valid && storeCanAccept
io.store_req.ready := store_req.ready && storeCanAccept
when (store_req.fire) { // if wait too long and write success, reset counter.
storeWaitCycles := 0.U
} .elsewhen (storeWaitCycles < StoreWaitThreshold && store_req.valid && !store_req.ready) { // if block store, increase counter.
storeWaitCycles := storeWaitCycles + 1.U
}
// s0: read meta and tag
val req = Wire(DecoupledIO(new MainPipeReq))
......@@ -244,7 +257,7 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
val s1_banked_rmask = RegEnable(s0_banked_rmask, s0_fire)
val s1_banked_store_wmask = RegEnable(banked_store_wmask, s0_fire)
val s1_need_tag = RegEnable(s0_need_tag, s0_fire)
val s1_can_go = s2_ready && (io.data_read.ready || !s1_need_data)
val s1_can_go = s2_ready && (io.data_readline.ready || !s1_need_data)
val s1_fire = s1_valid && s1_can_go
val s1_idx = get_idx(s1_req.vaddr)
......@@ -293,8 +306,19 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
XSPerfAccumulate("replace_unused_prefetch", s1_req.replace && s1_extra_meta.prefetch && !s1_extra_meta.access) // may not be accurate
// replacement policy
val s1_invalid_vec = wayMap(w => !meta_resp(w).asTypeOf(new Meta).coh.isValid())
val s1_have_invalid_way = s1_invalid_vec.asUInt.orR
val s1_invalid_way_en = ParallelPriorityMux(s1_invalid_vec.zipWithIndex.map(x => x._1 -> UIntToOH(x._2.U(nWays.W))))
val s1_repl_way_en = WireInit(0.U(nWays.W))
s1_repl_way_en := Mux(RegNext(s0_fire), UIntToOH(io.replace_way.way), RegNext(s1_repl_way_en))
s1_repl_way_en := Mux(
RegNext(s0_fire),
Mux(
s1_have_invalid_way,
s1_invalid_way_en,
UIntToOH(io.replace_way.way)
),
RegNext(s1_repl_way_en)
)
val s1_repl_tag = Mux1H(s1_repl_way_en, wayMap(w => tag_resp(w)))
val s1_repl_coh = Mux1H(s1_repl_way_en, wayMap(w => meta_resp(w))).asTypeOf(new ClientMetadata)
val s1_miss_tag = Mux1H(s1_req.miss_way_en, wayMap(w => tag_resp(w)))
......@@ -337,6 +361,9 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
)
)
XSPerfAccumulate("store_has_invalid_way_but_select_valid_way", io.replace_way.set.valid && wayMap(w => !meta_resp(w).asTypeOf(new Meta).coh.isValid()).asUInt.orR && s1_need_replacement && s1_repl_coh.isValid())
XSPerfAccumulate("store_using_replacement", io.replace_way.set.valid && s1_need_replacement)
val s1_has_permission = s1_hit_coh.onAccess(s1_req.cmd)._1
val s1_hit = s1_tag_match && s1_has_permission
val s1_pregen_can_go_to_mq = !s1_req.replace && !s1_req.probe && !s1_req.miss && (s1_req.isStore || s1_req.isAMO) && !s1_hit
......@@ -1396,10 +1423,10 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
io.tag_read.bits.way_en := ~0.U(nWays.W)
io.data_read_intend := s1_valid_dup(3) && s1_need_data
io.data_read.valid := s1_valid_dup(4) && s1_need_data
io.data_read.bits.rmask := s1_banked_rmask
io.data_read.bits.way_en := s1_way_en
io.data_read.bits.addr := s1_req_vaddr_dup_for_data_read
io.data_readline.valid := s1_valid_dup(4) && s1_need_data
io.data_readline.bits.rmask := s1_banked_rmask
io.data_readline.bits.way_en := s1_way_en
io.data_readline.bits.addr := s1_req_vaddr_dup_for_data_read
io.miss_req.valid := s2_valid_dup(4) && s2_can_go_to_mq_dup(0)
val miss_req = io.miss_req.bits
......
......@@ -27,8 +27,7 @@ import freechips.rocketchip.tilelink.ClientStates._
import freechips.rocketchip.tilelink.MemoryOpCategories._
import freechips.rocketchip.tilelink.TLPermissions._
import difftest._
import huancun.{AliasKey, DirtyKey, PreferCacheKey, PrefetchKey}
import utility.FastArbiter
import coupledL2.{AliasKey, DirtyKey, PrefetchKey}
import mem.{AddPipelineReg}
import mem.trace._
......@@ -116,6 +115,8 @@ class MissReq(implicit p: Parameters) extends MissReqWoStoreData {
class MissResp(implicit p: Parameters) extends DCacheBundle {
val id = UInt(log2Up(cfg.nMissEntries).W)
// cache miss request is handled by miss queue, either merged or newly allocated
val handled = Bool()
// cache req missed, merged into one of miss queue entries
// i.e. !miss_merged means this access is the first miss for this cacheline
val merged = Bool()
......@@ -251,11 +252,34 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
// whether the pipeline reg has send out an acquire
val acquire_fired_by_pipe_reg = Input(Bool())
val perf_pending_prefetch = Output(Bool())
val perf_pending_normal = Output(Bool())
val rob_head_query = new DCacheBundle {
val vaddr = Input(UInt(VAddrBits.W))
val query_valid = Input(Bool())
val resp = Output(Bool())
// Cacheline-granular compare against the queried vaddr: both addresses are
// truncated to the block address (offset bits below DCacheLineOffset dropped).
// Only meaningful while query_valid is asserted.
def hit(e_vaddr: UInt): Bool = {
  require(e_vaddr.getWidth == VAddrBits)
  val line_match = vaddr(VAddrBits - 1, DCacheLineOffset) === e_vaddr(VAddrBits - 1, DCacheLineOffset)
  line_match && query_valid
}
}
val latency_monitor = new DCacheBundle {
val load_miss_refilling = Output(Bool())
val store_miss_refilling = Output(Bool())
val amo_miss_refilling = Output(Bool())
val pf_miss_refilling = Output(Bool())
}
})
assert(!RegNext(io.primary_valid && !io.primary_ready))
val req = Reg(new MissReqWoStoreData)
val req_primary_fire = Reg(new MissReqWoStoreData) // for perf use
val req_store_mask = Reg(UInt(cfg.blockBytes.W))
val req_valid = RegInit(false.B)
val set = addr_to_dcache_set(req.vaddr)
......@@ -308,6 +332,14 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
val req_handled_by_this_entry = primary_fire || secondary_fire
// for perf use
val secondary_fired = RegInit(false.B)
io.perf_pending_prefetch := req_valid && prefetch && !secondary_fired
io.perf_pending_normal := req_valid && (!prefetch || secondary_fired)
io.rob_head_query.resp := io.rob_head_query.hit(req.vaddr) && req_valid
io.req_handled_by_this_entry := req_handled_by_this_entry
when (release_entry && req_valid) {
......@@ -317,7 +349,9 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
when (io.miss_req_pipe_reg.alloc) {
assert(RegNext(primary_fire), "after 1 cycle of primary_fire, entry will be allocated")
req_valid := true.B
req := miss_req_pipe_reg_bits.toMissReqWoStoreData()
req_primary_fire := miss_req_pipe_reg_bits.toMissReqWoStoreData()
req.addr := get_block_addr(miss_req_pipe_reg_bits.addr)
s_acquire := io.acquire_fired_by_pipe_reg
......@@ -353,6 +387,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
error := false.B
prefetch := input_req_is_prefetch
access := false.B
secondary_fired := false.B
}
when (io.miss_req_pipe_reg.merge) {
......@@ -382,6 +417,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
when (!input_req_is_prefetch) {
access := true.B // when merge non-prefetch req, set access bit
}
secondary_fired := true.B
}
when (io.mem_acquire.fire()) {
......@@ -544,8 +580,20 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
io.mem_acquire.bits.user.lift(AliasKey).foreach( _ := req.vaddr(13, 12))
// trigger prefetch
io.mem_acquire.bits.user.lift(PrefetchKey).foreach(_ := Mux(io.l2_pf_store_only, req.isFromStore, true.B))
// prefer not to cache data in L2 by default
io.mem_acquire.bits.user.lift(PreferCacheKey).foreach(_ := false.B)
// req source
when(prefetch && !secondary_fired) {
io.mem_acquire.bits.user.lift(ReqSourceKey).foreach(_ := MemReqSource.L1DataPrefetch.id.U)
}.otherwise {
when(req.isFromStore) {
io.mem_acquire.bits.user.lift(ReqSourceKey).foreach(_ := MemReqSource.CPUStoreData.id.U)
}.elsewhen(req.isFromLoad) {
io.mem_acquire.bits.user.lift(ReqSourceKey).foreach(_ := MemReqSource.CPULoadData.id.U)
}.elsewhen(req.isFromAMO) {
io.mem_acquire.bits.user.lift(ReqSourceKey).foreach(_ := MemReqSource.CPUAtomicData.id.U)
}.otherwise {
io.mem_acquire.bits.user.lift(ReqSourceKey).foreach(_ := MemReqSource.L1DataPrefetch.id.U)
}
}
require(nSets <= 256)
io.mem_grant.ready := !w_grantlast && s_acquire
......@@ -632,6 +680,12 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
io.forwardInfo.apply(req_valid, req.addr, refill_data_raw, w_grantfirst, w_grantlast)
// refill latency monitor
io.latency_monitor.load_miss_refilling := req_valid && req_primary_fire.isFromLoad && BoolStopWatch(io.mem_acquire.fire, io.mem_grant.fire && !refill_done, true)
io.latency_monitor.store_miss_refilling := req_valid && req_primary_fire.isFromStore && BoolStopWatch(io.mem_acquire.fire, io.mem_grant.fire && !refill_done, true)
io.latency_monitor.amo_miss_refilling := req_valid && req_primary_fire.isFromAMO && BoolStopWatch(io.mem_acquire.fire, io.mem_grant.fire && !refill_done, true)
io.latency_monitor.pf_miss_refilling := req_valid && req_primary_fire.isFromPrefetch && BoolStopWatch(io.mem_acquire.fire, io.mem_grant.fire && !refill_done, true)
XSPerfAccumulate("miss_req_primary", primary_fire)
XSPerfAccumulate("miss_req_merged", secondary_fire)
XSPerfAccumulate("load_miss_penalty_to_use",
......@@ -725,6 +779,7 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
assert(PopCount(Seq(req_pipeline_reg_handled, VecInit(req_mshr_handled_vec).asUInt.orR)) <= 1.U, "miss req will either go to mshr or pipeline reg")
assert(PopCount(req_mshr_handled_vec) <= 1.U, "Only one mshr can handle a req")
io.resp.id := Mux(!req_pipeline_reg_handled, OHToUInt(req_mshr_handled_vec), miss_req_pipe_reg.mshr_id)
io.resp.handled := Cat(req_mshr_handled_vec).orR || req_pipeline_reg_handled
io.resp.merged := merge
io.resp.repl_way_en := Mux(!req_pipeline_reg_handled, Mux1H(secondary_ready_vec, entries.map(_.io.repl_way_en)), miss_req_pipe_reg.req.way_en)
......@@ -856,8 +911,9 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
debug_miss_trace.source := io.req.bits.source
debug_miss_trace.pc := io.req.bits.pc
val isWriteL1MissQMissTable = WireInit(Constantin.createRecord("isWriteL1MissQMissTable" + p(XSCoreParamsKey).HartId.toString))
val table = ChiselDB.createTable("L1MissQMissTrace_hart"+ p(XSCoreParamsKey).HartId.toString, new L1MissTrace)
table.log(debug_miss_trace, io.req.valid && !io.req.bits.cancel && alloc, "MissQueue", clock, reset)
table.log(debug_miss_trace, isWriteL1MissQMissTable.orR && io.req.valid && !io.req.bits.cancel && alloc, "MissQueue", clock, reset)
// Difftest
if (env.EnableDifftest) {
......@@ -871,13 +927,16 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
}
// Perf count
XSPerfAccumulate("miss_req", io.req.fire())
XSPerfAccumulate("miss_req_allocate", io.req.fire() && alloc)
XSPerfAccumulate("miss_req_merge_load", io.req.fire() && merge && io.req.bits.isFromLoad)
XSPerfAccumulate("miss_req_reject_load", io.req.valid && reject && io.req.bits.isFromLoad)
XSPerfAccumulate("miss_req", io.req.fire() && !io.req.bits.cancel)
XSPerfAccumulate("miss_req_allocate", io.req.fire() && !io.req.bits.cancel && alloc)
XSPerfAccumulate("miss_req_load_allocate", io.req.fire() && !io.req.bits.cancel && alloc && io.req.bits.isFromLoad)
XSPerfAccumulate("miss_req_store_allocate", io.req.fire() && !io.req.bits.cancel && alloc && io.req.bits.isFromStore)
XSPerfAccumulate("miss_req_amo_allocate", io.req.fire() && !io.req.bits.cancel && alloc && io.req.bits.isFromAMO)
XSPerfAccumulate("miss_req_merge_load", io.req.fire() && !io.req.bits.cancel && merge && io.req.bits.isFromLoad)
XSPerfAccumulate("miss_req_reject_load", io.req.valid && !io.req.bits.cancel && reject && io.req.bits.isFromLoad)
XSPerfAccumulate("probe_blocked_by_miss", io.probe_block)
XSPerfAccumulate("prefetch_primary_fire", io.req.fire() && alloc && io.req.bits.isFromPrefetch)
XSPerfAccumulate("prefetch_secondary_fire", io.req.fire() && merge && io.req.bits.isFromPrefetch)
XSPerfAccumulate("prefetch_primary_fire", io.req.fire() && !io.req.bits.cancel && alloc && io.req.bits.isFromPrefetch)
XSPerfAccumulate("prefetch_secondary_fire", io.req.fire() && !io.req.bits.cancel && merge && io.req.bits.isFromPrefetch)
val max_inflight = RegInit(0.U((log2Up(cfg.nMissEntries) + 1).W))
val num_valids = PopCount(~Cat(primary_ready_vec).asUInt)
when (num_valids > max_inflight) {
......@@ -889,6 +948,32 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
io.full := num_valids === cfg.nMissEntries.U
XSPerfHistogram("num_valids", num_valids, true.B, 0, cfg.nMissEntries, 1)
XSPerfHistogram("L1DMLP_CPUData", PopCount(VecInit(entries.map(_.io.perf_pending_normal)).asUInt), true.B, 0, cfg.nMissEntries, 1)
XSPerfHistogram("L1DMLP_Prefetch", PopCount(VecInit(entries.map(_.io.perf_pending_prefetch)).asUInt), true.B, 0, cfg.nMissEntries, 1)
XSPerfHistogram("L1DMLP_Total", num_valids, true.B, 0, cfg.nMissEntries, 1)
XSPerfAccumulate("miss_load_refill_latency", PopCount(entries.map(_.io.latency_monitor.load_miss_refilling)))
XSPerfAccumulate("miss_store_refill_latency", PopCount(entries.map(_.io.latency_monitor.store_miss_refilling)))
XSPerfAccumulate("miss_amo_refill_latency", PopCount(entries.map(_.io.latency_monitor.amo_miss_refilling)))
XSPerfAccumulate("miss_pf_refill_latency", PopCount(entries.map(_.io.latency_monitor.pf_miss_refilling)))
val rob_head_miss_in_dcache = VecInit(entries.map(_.io.rob_head_query.resp)).asUInt.orR
val sourceVaddr = WireInit(0.U.asTypeOf(new Valid(UInt(VAddrBits.W))))
val lq_doing_other_replay = WireInit(false.B)
ExcitingUtils.addSink(sourceVaddr, s"rob_head_vaddr_${coreParams.HartId}", ExcitingUtils.Perf)
ExcitingUtils.addSink(lq_doing_other_replay, s"rob_head_other_replay_${coreParams.HartId}", ExcitingUtils.Perf)
entries.foreach {
case e => {
e.io.rob_head_query.query_valid := sourceVaddr.valid
e.io.rob_head_query.vaddr := sourceVaddr.bits
}
}
// ExcitingUtils.addSource(!rob_head_miss_in_dcache && !lq_doing_other_replay, s"load_l1_cache_stall_without_bank_conflict_${coreParams.HartId}", ExcitingUtils.Perf, true)
ExcitingUtils.addSource(rob_head_miss_in_dcache, s"load_l1_miss_${coreParams.HartId}", ExcitingUtils.Perf, true)
val perfValidCount = RegNext(PopCount(entries.map(entry => (!entry.io.primary_ready))))
val perfEvents = Seq(
("dcache_missq_req ", io.req.fire()),
......
......@@ -230,6 +230,7 @@ class ProbeQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule w
when (io.lrsc_locked_block.valid) {
XSDebug("lrsc_locked_block: %x\n", io.lrsc_locked_block.bits)
}
XSPerfAccumulate("ProbeL1DCache", io.mem_probe.fire)
val perfValidCount = RegNext(PopCount(entries.map(e => e.io.block_addr.valid)))
val perfEvents = Seq(
......
......@@ -81,9 +81,9 @@ class RefillPipe(implicit p: Parameters) extends DCacheModule {
})
// Assume that write in refill pipe is always ready
assert(RegNext(io.data_write.ready))
assert(RegNext(io.meta_write.ready))
assert(RegNext(io.tag_write.ready))
// assert(RegNext(io.data_write.ready))
// assert(RegNext(io.meta_write.ready))
// assert(RegNext(io.tag_write.ready))
val refill_w_valid = io.req.valid
val refill_w_req = io.req.bits
......
......@@ -21,7 +21,7 @@ import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink.TLPermissions._
import freechips.rocketchip.tilelink.{TLArbiter, TLBundleC, TLBundleD, TLEdgeOut}
import huancun.DirtyKey
import coupledL2.DirtyKey
import utils.{HasPerfEvents, HasTLDump, XSDebug, XSPerfAccumulate}
class WritebackReqCtrl(implicit p: Parameters) extends DCacheBundle {
......@@ -210,7 +210,8 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
// --------------------------------------------------------------------------------
// s_invalid: receive requests
// new req entering
when (io.req.valid && io.primary_valid && io.primary_ready) {
val alloc = io.req.valid && io.primary_valid && io.primary_ready
when (alloc) {
assert (remain === 0.U)
req := io.req.bits
s_data_override := false.B
......@@ -313,7 +314,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
data = beat_data(beat)
)._2
voluntaryReleaseData.echo.lift(DirtyKey).foreach(_ := req.dirty)
// voluntaryReleaseData.echo.lift(DirtyKey).foreach(_ := req.dirty)
when(busy) {
assert(!req.dirty || req.hasData)
}
......@@ -517,7 +518,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
data := mergeData(data, io.release_update.bits.data_delayed, io.release_update.bits.mask_delayed)
}
when (!s_data_override && req.hasData) {
when (!s_data_override && (req.hasData || RegNext(alloc))) {
data := io.req_data.data
}
......
......@@ -148,46 +148,38 @@ class DuplicatedTagArray(readPorts: Int)(implicit p: Parameters) extends DCacheM
val rdataEcc_dup_vec = Seq(3, 4, 5)
val wdata_dup_vec = Seq(6, 7, 8)
val wdataEcc_dup_vec = Seq(9, 10, 11)
for(dupIdx <- rdata_dup_vec) {
for(idx <- 0 until readPorts){
when(io.cacheOp_req_dup(dupIdx).valid && isReadTag(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.read.valid := true.B
array(idx).io.read.bits.idx := io.cacheOp.req.bits.index
array(idx).io.read.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
cacheOpShouldResp := true.B
}
rdata_dup_vec.zipWithIndex.map{ case(dupIdx, idx) =>
when(io.cacheOp_req_dup(dupIdx).valid && isReadTag(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.read.valid := true.B
array(idx).io.read.bits.idx := io.cacheOp.req.bits.index
array(idx).io.read.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
cacheOpShouldResp := true.B
}
}
for (dupIdx <- rdataEcc_dup_vec) {
for (idx <- 0 until readPorts) {
when(io.cacheOp_req_dup(dupIdx).valid && isReadTagECC(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.ecc_read.valid := true.B
array(idx).io.ecc_read.bits.idx := io.cacheOp.req.bits.index
array(idx).io.ecc_read.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
cacheOpShouldResp := true.B
}
rdataEcc_dup_vec.zipWithIndex.map{ case(dupIdx, idx) =>
when(io.cacheOp_req_dup(dupIdx).valid && isReadTagECC(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.ecc_read.valid := true.B
array(idx).io.ecc_read.bits.idx := io.cacheOp.req.bits.index
array(idx).io.ecc_read.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
cacheOpShouldResp := true.B
}
}
for (dupIdx <- wdata_dup_vec) {
for (idx <- 0 until readPorts) {
when(io.cacheOp_req_dup(dupIdx).valid && isWriteTag(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.write.valid := true.B
array(idx).io.write.bits.idx := io.cacheOp.req.bits.index
array(idx).io.write.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
array(idx).io.write.bits.tag := io.cacheOp.req.bits.write_tag_low
cacheOpShouldResp := true.B
}
wdata_dup_vec.zipWithIndex.map{ case(dupIdx, idx) =>
when(io.cacheOp_req_dup(dupIdx).valid && isWriteTag(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.write.valid := true.B
array(idx).io.write.bits.idx := io.cacheOp.req.bits.index
array(idx).io.write.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
array(idx).io.write.bits.tag := io.cacheOp.req.bits.write_tag_low
cacheOpShouldResp := true.B
}
}
for (dupIdx <- wdataEcc_dup_vec) {
for (idx <- 0 until readPorts) {
when(io.cacheOp_req_dup(dupIdx).valid && isWriteTagECC(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.ecc_write.valid := true.B
array(idx).io.ecc_write.bits.idx := io.cacheOp.req.bits.index
array(idx).io.ecc_write.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
array(idx).io.ecc_write.bits.ecc := io.cacheOp.req.bits.write_tag_ecc
cacheOpShouldResp := true.B
}
wdataEcc_dup_vec.zipWithIndex.map{ case(dupIdx, idx) =>
when(io.cacheOp_req_dup(dupIdx).valid && isWriteTagECC(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.ecc_write.valid := true.B
array(idx).io.ecc_write.bits.idx := io.cacheOp.req.bits.index
array(idx).io.ecc_write.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
array(idx).io.ecc_write.bits.ecc := io.cacheOp.req.bits.write_tag_ecc
cacheOpShouldResp := true.B
}
}
......
......@@ -29,7 +29,6 @@ import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp}
import freechips.rocketchip.tilelink._
import xiangshan.backend.fu.{PMP, PMPChecker, PMPReqBundle, PMPRespBundle}
import xiangshan.backend.fu.util.HasCSRConst
import utility.ChiselDB
import difftest._
class L2TLB()(implicit p: Parameters) extends LazyModule with HasPtwConst {
......@@ -38,7 +37,8 @@ class L2TLB()(implicit p: Parameters) extends LazyModule with HasPtwConst {
clients = Seq(TLMasterParameters.v1(
"ptw",
sourceId = IdRange(0, MemReqWidth)
))
)),
requestFields = Seq(ReqSourceField())
)))
lazy val module = new L2TLBImp(this)
......@@ -136,10 +136,11 @@ class L2TLBImp(outer: L2TLB)(implicit p: Parameters) extends PtwModule(outer) wi
prefetch.io.csr := csr_dup(0)
arb2.io.in(InArbPrefetchPort) <> prefetch.io.out
val isWriteL2TlbPrefetchTable = WireInit(Constantin.createRecord("isWriteL2TlbPrefetchTable" + p(XSCoreParamsKey).HartId.toString))
val L2TlbPrefetchTable = ChiselDB.createTable("L2TlbPrefetch_hart" + p(XSCoreParamsKey).HartId.toString, new L2TlbPrefetchDB)
val L2TlbPrefetchDB = Wire(new L2TlbPrefetchDB)
L2TlbPrefetchDB.vpn := prefetch.io.out.bits.vpn
L2TlbPrefetchTable.log(L2TlbPrefetchDB, prefetch.io.out.fire, "L2TlbPrefetch", clock, reset)
L2TlbPrefetchTable.log(L2TlbPrefetchDB, isWriteL2TlbPrefetchTable.orR && prefetch.io.out.fire, "L2TlbPrefetch", clock, reset)
}
arb2.io.out.ready := cache.io.req.ready
......@@ -252,6 +253,7 @@ class L2TLBImp(outer: L2TLB)(implicit p: Parameters) extends PtwModule(outer) wi
)._2
mem.a.bits := memRead
mem.a.valid := mem_arb.io.out.valid && !flush
mem.a.bits.user.lift(ReqSourceKey).foreach(_ := MemReqSource.PTW.id.U)
mem.d.ready := true.B
// mem -> data buffer
val refill_data = Reg(Vec(blockBits / l1BusDataWidth, UInt(l1BusDataWidth.W)))
......@@ -443,6 +445,7 @@ class L2TLBImp(outer: L2TLB)(implicit p: Parameters) extends PtwModule(outer) wi
ptw_sector_resp.af := pte.entry(OHToUInt(pte.pteidx)).af
ptw_sector_resp.pf := pte.entry(OHToUInt(pte.pteidx)).pf
ptw_sector_resp.addr_low := OHToUInt(pte.pteidx)
ptw_sector_resp.pteidx := pte.pteidx
for (i <- 0 until tlbcontiguous) {
val ppn_equal = pte.entry(i).ppn === pte.entry(OHToUInt(pte.pteidx)).ppn
val perm_equal = pte.entry(i).perm.getOrElse(0.U.asTypeOf(new PtePermBundle)).asUInt === pte.entry(OHToUInt(pte.pteidx)).perm.getOrElse(0.U.asTypeOf(new PtePermBundle)).asUInt
......@@ -496,17 +499,19 @@ class L2TLBImp(outer: L2TLB)(implicit p: Parameters) extends PtwModule(outer) wi
val perfEvents = Seq(llptw, cache, ptw).flatMap(_.getPerfEvents)
generatePerfEvent()
val isWriteL1TlbTable = WireInit(Constantin.createRecord("isWriteL1TlbTable" + p(XSCoreParamsKey).HartId.toString))
val L1TlbTable = ChiselDB.createTable("L1Tlb_hart" + p(XSCoreParamsKey).HartId.toString, new L1TlbDB)
val ITlbReqDB, DTlbReqDB, ITlbRespDB, DTlbRespDB = Wire(new L1TlbDB)
ITlbReqDB.vpn := io.tlb(0).req(0).bits.vpn
DTlbReqDB.vpn := io.tlb(1).req(0).bits.vpn
ITlbRespDB.vpn := io.tlb(0).resp.bits.entry.tag
DTlbRespDB.vpn := io.tlb(1).resp.bits.entry.tag
L1TlbTable.log(ITlbReqDB, io.tlb(0).req(0).fire, "ITlbReq", clock, reset)
L1TlbTable.log(DTlbReqDB, io.tlb(1).req(0).fire, "DTlbReq", clock, reset)
L1TlbTable.log(ITlbRespDB, io.tlb(0).resp.fire, "ITlbResp", clock, reset)
L1TlbTable.log(DTlbRespDB, io.tlb(1).resp.fire, "DTlbResp", clock, reset)
L1TlbTable.log(ITlbReqDB, isWriteL1TlbTable.orR && io.tlb(0).req(0).fire, "ITlbReq", clock, reset)
L1TlbTable.log(DTlbReqDB, isWriteL1TlbTable.orR && io.tlb(1).req(0).fire, "DTlbReq", clock, reset)
L1TlbTable.log(ITlbRespDB, isWriteL1TlbTable.orR && io.tlb(0).resp.fire, "ITlbResp", clock, reset)
L1TlbTable.log(DTlbRespDB, isWriteL1TlbTable.orR && io.tlb(1).resp.fire, "DTlbResp", clock, reset)
val isWritePageCacheTable = WireInit(Constantin.createRecord("isWritePageCacheTable" + p(XSCoreParamsKey).HartId.toString))
val PageCacheTable = ChiselDB.createTable("PageCache_hart" + p(XSCoreParamsKey).HartId.toString, new PageCacheDB)
val PageCacheDB = Wire(new PageCacheDB)
PageCacheDB.vpn := Cat(cache.io.resp.bits.toTlb.entry(0).tag, OHToUInt(cache.io.resp.bits.toTlb.pteidx))
......@@ -518,8 +523,9 @@ class L2TLBImp(outer: L2TLB)(implicit p: Parameters) extends PtwModule(outer) wi
PageCacheDB.l2Hit := cache.io.resp.bits.toFsm.l2Hit
PageCacheDB.l1Hit := cache.io.resp.bits.toFsm.l1Hit
PageCacheDB.hit := cache.io.resp.bits.hit
PageCacheTable.log(PageCacheDB, cache.io.resp.fire, "PageCache", clock, reset)
PageCacheTable.log(PageCacheDB, isWritePageCacheTable.orR && cache.io.resp.fire, "PageCache", clock, reset)
val isWritePTWTable = WireInit(Constantin.createRecord("isWritePTWTable" + p(XSCoreParamsKey).HartId.toString))
val PTWTable = ChiselDB.createTable("PTW_hart" + p(XSCoreParamsKey).HartId.toString, new PTWDB)
val PTWReqDB, PTWRespDB, LLPTWReqDB, LLPTWRespDB = Wire(new PTWDB)
PTWReqDB.vpn := ptw.io.req.bits.req_info.vpn
......@@ -530,17 +536,18 @@ class L2TLBImp(outer: L2TLB)(implicit p: Parameters) extends PtwModule(outer) wi
LLPTWReqDB.source := llptw.io.in.bits.req_info.source
LLPTWRespDB.vpn := llptw.io.mem.refill.vpn
LLPTWRespDB.source := llptw.io.mem.refill.source
PTWTable.log(PTWReqDB, ptw.io.req.fire, "PTWReq", clock, reset)
PTWTable.log(PTWRespDB, ptw.io.mem.resp.fire, "PTWResp", clock, reset)
PTWTable.log(LLPTWReqDB, llptw.io.in.fire, "LLPTWReq", clock, reset)
PTWTable.log(LLPTWRespDB, llptw.io.mem.resp.fire, "LLPTWResp", clock, reset)
PTWTable.log(PTWReqDB, isWritePTWTable.orR && ptw.io.req.fire, "PTWReq", clock, reset)
PTWTable.log(PTWRespDB, isWritePTWTable.orR && ptw.io.mem.resp.fire, "PTWResp", clock, reset)
PTWTable.log(LLPTWReqDB, isWritePTWTable.orR && llptw.io.in.fire, "LLPTWReq", clock, reset)
PTWTable.log(LLPTWRespDB, isWritePTWTable.orR && llptw.io.mem.resp.fire, "LLPTWResp", clock, reset)
val isWriteL2TlbMissQueueTable = WireInit(Constantin.createRecord("isWriteL2TlbMissQueueTable" + p(XSCoreParamsKey).HartId.toString))
val L2TlbMissQueueTable = ChiselDB.createTable("L2TlbMissQueue_hart" + p(XSCoreParamsKey).HartId.toString, new L2TlbMissQueueDB)
val L2TlbMissQueueInDB, L2TlbMissQueueOutDB = Wire(new L2TlbMissQueueDB)
L2TlbMissQueueInDB.vpn := missQueue.io.in.bits.vpn
L2TlbMissQueueOutDB.vpn := missQueue.io.out.bits.vpn
L2TlbMissQueueTable.log(L2TlbMissQueueInDB, missQueue.io.in.fire, "L2TlbMissQueueIn", clock, reset)
L2TlbMissQueueTable.log(L2TlbMissQueueOutDB, missQueue.io.out.fire, "L2TlbMissQueueOut", clock, reset)
L2TlbMissQueueTable.log(L2TlbMissQueueInDB, isWriteL2TlbMissQueueTable.orR && missQueue.io.in.fire, "L2TlbMissQueueIn", clock, reset)
L2TlbMissQueueTable.log(L2TlbMissQueueOutDB, isWriteL2TlbMissQueueTable.orR && missQueue.io.out.fire, "L2TlbMissQueueOut", clock, reset)
}
/** BlockHelper, block missqueue, not to send too many req to cache
......
......@@ -81,6 +81,41 @@ class TlbPermBundle(implicit p: Parameters) extends TlbBundle {
val w = Bool()
val r = Bool()
val pm = new TlbPMBundle
/** Fill this permission bundle from a PTW sector response plus the matching
 *  static PMP configuration. Returns `this` for use in connection expressions.
 */
def apply(item: PtwSectorResp, pm: PMPConfig) = {
  // Software-defined permission bits come straight from the PTE in the response.
  val swPerm = item.entry.perm.get.asTypeOf(new PtePermBundle().cloneType)
  this.d := swPerm.d
  this.a := swPerm.a
  this.g := swPerm.g
  this.u := swPerm.u
  this.x := swPerm.x
  this.w := swPerm.w
  this.r := swPerm.r
  // Fault bits are decided by the PTW, not the PTE permissions.
  this.pf := item.pf
  this.af := item.af
  // Fold in the static PMP/PMA attributes.
  this.pm.assign_ap(pm)
  this
}
// Debug formatting: dump every permission field on one line.
override def toPrintable: Printable = {
  p"pf:${pf} af:${af} d:${d} a:${a} g:${g} u:${u} x:${x} w:${w} r:${r} pm:${pm}"
}
}
class TlbSectorPermBundle(implicit p: Parameters) extends TlbBundle {
val pf = Bool() // NOTE: if this is true, just raise pf
val af = Bool() // NOTE: if this is true, just raise af
// pagetable perm (software defined)
val d = Bool()
val a = Bool()
val g = Bool()
val u = Bool()
val x = Bool()
val w = Bool()
val r = Bool()
// static pmp & pma check has a minimum grain size of 4K
// So sector tlb will use eight static pm entries
val pm = Vec(tlbcontiguous, new TlbPMBundle)
......@@ -140,6 +175,95 @@ class CAMTemplate[T <: Data](val gen: T, val set: Int, val readWidth: Int)(impli
/** A single (non-sector) TLB entry.
 *
 *  Storage layout is parameterized by which page sizes the owning array holds
 *  (at least one must be enabled):
 *   - pageNormal only: 4KB pages; full-width tag/ppn, no level field
 *   - pageSuper only:  super pages; tag/ppn shortened by one vpn segment,
 *                      1-bit level
 *   - both:            2-bit level encodes which vpn segments are significant
 *  See the "level usage" note below for the level encoding.
 */
class TlbEntry(pageNormal: Boolean, pageSuper: Boolean)(implicit p: Parameters) extends TlbBundle {
  require(pageNormal || pageSuper)

  // Super-only arrays drop the lowest vpn segment from the stored tag.
  val tag = if (!pageNormal) UInt((vpnLen - vpnnLen).W)
            else UInt(vpnLen.W)
  val asid = UInt(asidLen.W)
  // Page-size discriminator; width depends on which page sizes coexist.
  val level = if (!pageNormal) Some(UInt(1.W))
              else if (!pageSuper) None
              else Some(UInt(2.W))
  val ppn = if (!pageNormal) UInt((ppnLen - vpnnLen).W)
            else UInt(ppnLen.W)
  val perm = new TlbPermBundle

  /** level usage:
   *  !PageSuper: page is only normal, level is None, match all the tag
   *  !PageNormal: page is only super, level is a Bool(), match high 9*2 parts
   *  bits0  0: need mid 9bits
   *         1: no need mid 9bits
   *  PageSuper && PageNormal: page hold all the three type,
   *  bits0  0: need low 9bits
   *  bits1  0: need mid 9bits
   */

  /** Tag/ASID compare for a lookup.
   *
   *  @param vpn        virtual page number being looked up
   *  @param asid       current address-space id
   *  @param nSets      set count of the owning array; the low set-index bits
   *                    of the tag are skipped in the compare (see NOTE below)
   *  @param ignoreAsid when true, hit regardless of ASID
   */
  def hit(vpn: UInt, asid: UInt, nSets: Int = 1, ignoreAsid: Boolean = false): Bool = {
    val asid_hit = if (ignoreAsid) true.B else (this.asid === asid)

    // NOTE: for timing, don't care low set index bits at hit check
    //       (they do not need to be stored at all)
    if (!pageSuper) asid_hit && drop_set_equal(vpn, tag, nSets)
    else if (!pageNormal) {
      // Super-only entry: compare the high segment always, and the mid
      // segment unless level masks it off.
      val tag_match_hi = tag(vpnnLen*2-1, vpnnLen) === vpn(vpnnLen*3-1, vpnnLen*2)
      val tag_match_mi = tag(vpnnLen-1, 0) === vpn(vpnnLen*2-1, vpnnLen)
      val tag_match = tag_match_hi && (level.get.asBool() || tag_match_mi)
      asid_hit && tag_match
    }
    else {
      // Mixed array: each level bit disables one segment compare.
      val tmp_level = level.get
      val tag_match_hi = tag(vpnnLen*3-1, vpnnLen*2) === vpn(vpnnLen*3-1, vpnnLen*2)
      val tag_match_mi = tag(vpnnLen*2-1, vpnnLen) === vpn(vpnnLen*2-1, vpnnLen)
      val tag_match_lo = tag(vpnnLen-1, 0) === vpn(vpnnLen-1, 0) // if pageNormal is false, this will always be false
      val tag_match = tag_match_hi && (tmp_level(1) || tag_match_mi) && (tmp_level(0) || tag_match_lo)
      asid_hit && tag_match
    }
  }

  /** Refill this entry from a PTW sector response.
   *  Returns `this` so the connections can be used as an expression.
   */
  def apply(item: PtwSectorResp, asid: UInt, pm: PMPConfig): TlbEntry = {
    // Sector responses carry a shortened tag; for normal pages re-append the
    // in-sector index (pteidx) to recover the full vpn-width tag.
    this.tag := {if (pageNormal) Cat(item.entry.tag, OHToUInt(item.pteidx)) else item.entry.tag(sectorvpnLen - 1, vpnnLen - sectortlbwidth)}
    this.asid := asid
    val inner_level = item.entry.level.getOrElse(0.U)
    // Re-encode the PTW level field into this entry's level format described
    // in the "level usage" note (assumes PTW level 2 = 4KB leaf — TODO confirm
    // against the PTW encoding).
    this.level.map(_ := { if (pageNormal && pageSuper) MuxLookup(inner_level, 0.U, Seq(
                                                        0.U -> 3.U,
                                                        1.U -> 1.U,
                                                        2.U -> 0.U ))
                          else if (pageSuper) ~inner_level(0)
                          else 0.U })
    // Normal pages concatenate the sector ppn with the per-PTE low bits.
    this.ppn := { if (!pageNormal) item.entry.ppn(sectorppnLen - 1, vpnnLen - sectortlbwidth)
                  else Cat(item.entry.ppn, item.ppn_low(OHToUInt(item.pteidx))) }
    this.perm.apply(item, pm)
    this
  }

  // 4KB is a normal entry; 2MB/1GB are considered super entries.
  def is_normalentry(): Bool = {
    if (!pageSuper) { true.B }
    else if (!pageNormal) { false.B }
    else { level.get === 0.U }
  }

  /** Compose the physical page number for `vpn`: super pages take their low
   *  ppn segments from the request vpn, selected by the level bits.
   *
   *  @param saveLevel when true, register the low 2*vpnnLen result bits
   *                   (enable = `valid`) to shorten the combinational path;
   *                   the high ppn bits bypass the register
   */
  def genPPN(saveLevel: Boolean = false, valid: Bool = false.B)(vpn: UInt) : UInt = {
    val inner_level = level.getOrElse(0.U)
    val ppn_res = if (!pageSuper) ppn
      else if (!pageNormal) Cat(ppn(ppnLen-vpnnLen-1, vpnnLen),
        Mux(inner_level(0), vpn(vpnnLen*2-1, vpnnLen), ppn(vpnnLen-1,0)),
        vpn(vpnnLen-1, 0))
      else Cat(ppn(ppnLen-1, vpnnLen*2),
        Mux(inner_level(1), vpn(vpnnLen*2-1, vpnnLen), ppn(vpnnLen*2-1, vpnnLen)),
        Mux(inner_level(0), vpn(vpnnLen-1, 0), ppn(vpnnLen-1, 0)))

    if (saveLevel) Cat(ppn(ppn.getWidth-1, vpnnLen*2), RegEnable(ppn_res(vpnnLen*2-1, 0), valid))
    else ppn_res
  }

  override def toPrintable: Printable = {
    val inner_level = level.getOrElse(2.U)
    p"asid: ${asid} level:${inner_level} vpn:${Hexadecimal(tag)} ppn:${Hexadecimal(ppn)} perm:${perm}"
  }
}
class TlbSectorEntry(pageNormal: Boolean, pageSuper: Boolean)(implicit p: Parameters) extends TlbBundle {
require(pageNormal || pageSuper)
val tag = if (!pageNormal) UInt((vpnLen - vpnnLen).W)
else UInt(sectorvpnLen.W)
val asid = UInt(asidLen.W)
......@@ -148,8 +272,9 @@ class TlbEntry(pageNormal: Boolean, pageSuper: Boolean)(implicit p: Parameters)
else Some(UInt(2.W))
val ppn = if (!pageNormal) UInt((ppnLen - vpnnLen).W)
else UInt(sectorppnLen.W)
val perm = new TlbPermBundle
val perm = new TlbSectorPermBundle
val valididx = Vec(tlbcontiguous, Bool())
val pteidx = Vec(tlbcontiguous, Bool())
val ppn_low = Vec(tlbcontiguous, UInt(sectortlbwidth.W))
/** level usage:
......@@ -224,7 +349,7 @@ class TlbEntry(pageNormal: Boolean, pageSuper: Boolean)(implicit p: Parameters)
vpn_hit && index_hit.reduce(_ || _) && PopCount(data.valididx) === 1.U
}
def apply(item: PtwSectorResp, asid: UInt, pm: Seq[PMPConfig]): TlbEntry = {
def apply(item: PtwSectorResp, asid: UInt, pm: Seq[PMPConfig]): TlbSectorEntry = {
this.tag := {if (pageNormal) item.entry.tag else item.entry.tag(sectorvpnLen - 1, vpnnLen - sectortlbwidth)}
this.asid := asid
val inner_level = item.entry.level.getOrElse(0.U)
......@@ -239,6 +364,7 @@ class TlbEntry(pageNormal: Boolean, pageSuper: Boolean)(implicit p: Parameters)
this.perm.apply(item, pm)
this.ppn_low := item.ppn_low
this.valididx := item.valididx
this.pteidx := item.pteidx
this
}
......@@ -302,7 +428,7 @@ class TlbStorageIO(nSets: Int, nWays: Int, ports: Int, nDups: Int = 1)(implicit
val resp = Vec(ports, ValidIO(new Bundle{
val hit = Output(Bool())
val ppn = Vec(nDups, Output(UInt(ppnLen.W)))
val perm = Vec(nDups, Output(new TlbPermBundle()))
val perm = Vec(nDups, Output(new TlbSectorPermBundle()))
}))
}
val w = Flipped(ValidIO(new Bundle {
......@@ -412,8 +538,8 @@ class MemBlockidxBundle(implicit p: Parameters) extends TlbBundle {
val is_ld = Bool()
val is_st = Bool()
val idx =
if (LoadQueueSize >= StoreQueueSize) {
val idx = UInt(log2Ceil(LoadQueueSize).W)
if (VirtualLoadQueueSize >= StoreQueueSize) {
val idx = UInt(log2Ceil(VirtualLoadQueueSize).W)
idx
} else {
val idx = UInt(log2Ceil(StoreQueueSize).W)
......@@ -866,6 +992,7 @@ class PtwSectorResp(implicit p: Parameters) extends PtwBundle {
val addr_low = UInt(sectortlbwidth.W)
val ppn_low = Vec(tlbcontiguous, UInt(sectortlbwidth.W))
val valididx = Vec(tlbcontiguous, Bool())
val pteidx = Vec(tlbcontiguous, Bool())
val pf = Bool()
val af = Bool()
......
......@@ -67,7 +67,7 @@ case class L2TLBParameters
spSize: Int = 16,
spReplacer: Option[String] = Some("plru"),
// filter
ifilterSize: Int = 4,
ifilterSize: Int = 8,
dfilterSize: Int = 8,
// miss queue, add more entries than 'must require'
// 0 for easier bug trigger, please set as big as u can, 8 maybe
......
......@@ -133,6 +133,7 @@ class PTWRepeaterNB(Width: Int = 1, passReady: Boolean = false, FenceDelay: Int)
class PTWFilterIO(Width: Int)(implicit p: Parameters) extends MMUIOBaseBundle {
val tlb = Flipped(new VectorTlbPtwIO(Width))
val ptw = new TlbPtwIO()
val rob_head_miss_in_tlb = Output(Bool())
def apply(tlb: VectorTlbPtwIO, ptw: TlbPtwIO, sfence: SfenceBundle, csr: TlbCsrBundle): Unit = {
this.tlb <> tlb
......@@ -247,6 +248,7 @@ class PTWFilter(Width: Int, Size: Int, FenceDelay: Int)(implicit p: Parameters)
io.tlb.resp.bits.data.addr_low := ptwResp.addr_low
io.tlb.resp.bits.data.ppn_low := ptwResp.ppn_low
io.tlb.resp.bits.data.valididx := ptwResp.valididx
io.tlb.resp.bits.data.pteidx := ptwResp.pteidx
io.tlb.resp.bits.data.pf := ptwResp.pf
io.tlb.resp.bits.data.af := ptwResp.af
io.tlb.resp.bits.data.memidx := memidx(OHToUInt(ptwResp_OldMatchVec))
......@@ -323,6 +325,14 @@ class PTWFilter(Width: Int, Size: Int, FenceDelay: Int)(implicit p: Parameters)
inflight_counter := 0.U
}
val sourceVaddr = WireInit(0.U.asTypeOf(new Valid(UInt(VAddrBits.W))))
ExcitingUtils.addSink(sourceVaddr, s"rob_head_vaddr_${coreParams.HartId}", ExcitingUtils.Perf)
io.rob_head_miss_in_tlb := VecInit(v.zip(vpn).map{case (vi, vpni) => {
vi && sourceVaddr.valid && vpni === get_pn(sourceVaddr.bits)
}}).asUInt.orR
// perf
XSPerfAccumulate("tlb_req_count", PopCount(Cat(io.tlb.req.map(_.valid))))
XSPerfAccumulate("tlb_req_count_filtered", Mux(do_enq, accumEnqNum(Width - 1), 0.U))
......
......@@ -73,7 +73,7 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
// val vmEnable = satp.mode === 8.U // && (mode < ModeM) // FIXME: fix me when boot xv6/linux...
val vmEnable = if (EnbaleTlbDebug) (satp.mode === 8.U)
else (satp.mode === 8.U && (mode < ModeM))
val portTranslateEnable = (0 until Width).map(i => vmEnable && !req(i).bits.no_translate)
val portTranslateEnable = (0 until Width).map(i => vmEnable && RegNext(!req(i).bits.no_translate))
val req_in = req
val req_out = req.map(a => RegEnable(a.bits, a.fire()))
......
......@@ -98,7 +98,7 @@ class TLBFA(
io.r.req.map(_.ready := true.B)
val v = RegInit(VecInit(Seq.fill(nWays)(false.B)))
val entries = Reg(Vec(nWays, new TlbEntry(normalPage, superPage)))
val entries = Reg(Vec(nWays, new TlbSectorEntry(normalPage, superPage)))
val g = entries.map(_.perm.g)
for (i <- 0 until ports) {
......@@ -186,14 +186,21 @@ class TLBFA(
io.victim.out.valid := v(victim_idx) && io.w.valid && entries(victim_idx).is_normalentry()
io.victim.out.bits.entry := ns_to_n(entries(victim_idx))
def ns_to_n(ns: TlbEntry): TlbEntry = {
def ns_to_n(ns: TlbSectorEntry): TlbEntry = {
val n = Wire(new TlbEntry(pageNormal = true, pageSuper = false))
n.perm := ns.perm
n.ppn := ns.ppn
n.tag := ns.tag
n.perm.af := ns.perm.af
n.perm.pf := ns.perm.pf
n.perm.d := ns.perm.d
n.perm.a := ns.perm.a
n.perm.g := ns.perm.g
n.perm.u := ns.perm.u
n.perm.x := ns.perm.x
n.perm.w := ns.perm.w
n.perm.r := ns.perm.r
n.perm.pm := ns.perm.pm(OHToUInt(ns.pteidx))
n.ppn := Cat(ns.ppn, ns.ppn_low(OHToUInt(ns.pteidx)))
n.tag := Cat(ns.tag, OHToUInt(ns.pteidx))
n.asid := ns.asid
n.valididx := ns.valididx
n.ppn_low := ns.ppn_low
n
}
......@@ -249,10 +256,10 @@ class TLBSA(
val vpn = req.bits.vpn
val vpn_reg = RegEnable(vpn, req.fire())
val ridx = get_set_idx(vpn(vpn.getWidth - 1, sectortlbwidth), nSets)
val ridx = get_set_idx(vpn, nSets)
val v_resize = v.asTypeOf(Vec(VPRE_SELECT, Vec(VPOST_SELECT, UInt(nWays.W))))
val vidx_resize = RegNext(v_resize(get_set_idx(drop_set_idx(vpn(vpn.getWidth - 1, sectortlbwidth), VPOST_SELECT), VPRE_SELECT)))
val vidx = vidx_resize(get_set_idx(vpn_reg(vpn_reg.getWidth - 1, sectortlbwidth), VPOST_SELECT)).asBools.map(_ && RegNext(req.fire()))
val vidx_resize = RegNext(v_resize(get_set_idx(drop_set_idx(vpn, VPOST_SELECT), VPRE_SELECT)))
val vidx = vidx_resize(get_set_idx(vpn_reg, VPOST_SELECT)).asBools.map(_ && RegNext(req.fire()))
val vidx_bypass = RegNext((entries.io.waddr === ridx) && entries.io.wen)
entries.io.raddr(i) := ridx
......@@ -261,7 +268,18 @@ class TLBSA(
resp.bits.hit := hit
for (d <- 0 until nDups) {
resp.bits.ppn(d) := data(d).genPPN()(vpn_reg)
resp.bits.perm(d) := data(d).perm
resp.bits.perm(d).pf := data(d).perm.pf
resp.bits.perm(d).af := data(d).perm.af
resp.bits.perm(d).d := data(d).perm.d
resp.bits.perm(d).a := data(d).perm.a
resp.bits.perm(d).g := data(d).perm.g
resp.bits.perm(d).u := data(d).perm.u
resp.bits.perm(d).x := data(d).perm.x
resp.bits.perm(d).w := data(d).perm.w
resp.bits.perm(d).r := data(d).perm.r
for (i <- 0 until tlbcontiguous) {
resp.bits.perm(d).pm(i) := data(d).perm.pm
}
}
resp.valid := { RegNext(req.valid) }
......@@ -269,7 +287,7 @@ class TLBSA(
resp.bits.ppn.suggestName("ppn")
resp.bits.perm.suggestName("perm")
access.sets := get_set_idx(vpn_reg(vpn_reg.getWidth - 1, sectortlbwidth), nSets) // no use
access.sets := get_set_idx(vpn_reg, nSets) // no use
access.touch_ways.valid := resp.valid && hit
access.touch_ways.bits := 1.U // TODO: set-assoc need no replacer when nset is 1
}
......@@ -280,7 +298,7 @@ class TLBSA(
get_set_idx(io.w.bits.data.entry.tag, nSets),
get_set_idx(io.victim.in.bits.entry.tag, nSets))
entries.io.wdata := Mux(io.w.valid,
(Wire(new TlbEntry(normalPage, superPage)).apply(io.w.bits.data, io.csr.satp.asid, io.w.bits.data_replenish)),
(Wire(new TlbEntry(normalPage, superPage)).apply(io.w.bits.data, io.csr.satp.asid, io.w.bits.data_replenish(OHToUInt(io.w.bits.data.pteidx)))),
io.victim.in.bits.entry)
when (io.victim.in.valid) {
......@@ -303,13 +321,12 @@ class TLBSA(
val sfence = io.sfence
val sfence_vpn = sfence.bits.addr.asTypeOf(new VaBundle().cloneType).vpn
// Sfence will flush all sectors of an entry when hit
when (io.sfence.valid) {
when (sfence.bits.rs1) { // virtual address *.rs1 <- (rs1===0.U)
v.map(a => a.map(b => b := false.B))
}.otherwise {
// specific addr but all asid
v(get_set_idx(sfence_vpn(sfence_vpn.getWidth - 1, sectortlbwidth), nSets)).map(_ := false.B)
v(get_set_idx(sfence_vpn, nSets)).map(_ := false.B)
}
}
......@@ -327,7 +344,7 @@ class TLBSA(
for (i <- 0 until nSets) {
XSPerfAccumulate(s"hit${i}", io.r.resp.map(a => a.valid & a.bits.hit)
.zip(io.r.req.map(a => RegNext(get_set_idx(a.bits.vpn(a.bits.vpn.getWidth - 1, sectortlbwidth), nSets)) === i.U))
.zip(io.r.req.map(a => RegNext(get_set_idx(a.bits.vpn, nSets)) === i.U))
.map{a => (a._1 && a._2).asUInt()}
.fold(0.U)(_ + _)
)
......@@ -335,7 +352,7 @@ class TLBSA(
for (i <- 0 until nSets) {
XSPerfAccumulate(s"access${i}", io.r.resp.map(_.valid)
.zip(io.r.req.map(a => RegNext(get_set_idx(a.bits.vpn(a.bits.vpn.getWidth - 1, sectortlbwidth), nSets)) === i.U))
.zip(io.r.req.map(a => RegNext(get_set_idx(a.bits.vpn, nSets)) === i.U))
.map{a => (a._1 && a._2).asUInt()}
.fold(0.U)(_ + _)
)
......@@ -508,11 +525,20 @@ class TlbStorageWrapper(ports: Int, q: TLBParameters, nDups: Int = 1)(implicit p
rp.bits.hit := np.bits.hit || sp.bits.hit
for (d <- 0 until nDups) {
rp.bits.ppn(d) := Mux(sp.bits.hit, sp.bits.ppn(0), np.bits.ppn(d))
rp.bits.perm(d) := Mux(sp.bits.hit, sp.bits.perm(0), np.bits.perm(d))
rp.bits.perm(d).pf := Mux(sp.bits.hit, sp.bits.perm(0).pf, np.bits.perm(d).pf)
rp.bits.perm(d).af := Mux(sp.bits.hit, sp.bits.perm(0).af, np.bits.perm(d).af)
rp.bits.perm(d).d := Mux(sp.bits.hit, sp.bits.perm(0).d, np.bits.perm(d).d)
rp.bits.perm(d).a := Mux(sp.bits.hit, sp.bits.perm(0).a, np.bits.perm(d).a)
rp.bits.perm(d).g := Mux(sp.bits.hit, sp.bits.perm(0).g, np.bits.perm(d).g)
rp.bits.perm(d).u := Mux(sp.bits.hit, sp.bits.perm(0).u, np.bits.perm(d).u)
rp.bits.perm(d).x := Mux(sp.bits.hit, sp.bits.perm(0).x, np.bits.perm(d).x)
rp.bits.perm(d).w := Mux(sp.bits.hit, sp.bits.perm(0).w, np.bits.perm(d).w)
rp.bits.perm(d).r := Mux(sp.bits.hit, sp.bits.perm(0).r, np.bits.perm(d).r)
rp.bits.perm(d).pm := DontCare
}
rp.bits.super_hit := sp.bits.hit
rp.bits.super_ppn := sp.bits.ppn(0)
rp.bits.spm := np.bits.perm(0).pm(RegNext(io.r.req(i).bits.vpn(sectortlbwidth - 1, 0)))
rp.bits.spm := np.bits.perm(0).pm(0)
// Sector tlb may trigger multi-hit, see def "wbhit"
XSPerfAccumulate(s"port${i}_np_sp_multi_hit", !(!np.bits.hit || !sp.bits.hit || !rp.valid))
//assert(!np.bits.hit || !sp.bits.hit || !rp.valid, s"${q.name} storage ports${i} normal and super multi-hit")
......
......@@ -25,6 +25,7 @@ import utils._
import utility._
import scala.math.min
import xiangshan.backend.decode.ImmUnion
trait HasBPUConst extends HasXSParameter {
val MaxMetaLength = if (!env.FPGAPlatform) 512 else 256 // TODO: Reduce meta length
......@@ -244,6 +245,43 @@ class Predictor(implicit p: Parameters) extends XSModule with HasBPUConst with H
val ctrl = DelayN(io.ctrl, 1)
val predictors = Module(if (useBPD) new Composer else new FakePredictor)
def numOfStage = 3
require(numOfStage > 1, "BPU numOfStage must be greater than 1")
val topdown_stages = RegInit(VecInit(Seq.fill(numOfStage)(0.U.asTypeOf(new FrontendTopDownBundle))))
dontTouch(topdown_stages)
// following can only happen on s1
val controlRedirectBubble = Wire(Bool())
val ControlBTBMissBubble = Wire(Bool())
val TAGEMissBubble = Wire(Bool())
val SCMissBubble = Wire(Bool())
val ITTAGEMissBubble = Wire(Bool())
val RASMissBubble = Wire(Bool())
val memVioRedirectBubble = Wire(Bool())
val otherRedirectBubble = Wire(Bool())
val btbMissBubble = Wire(Bool())
otherRedirectBubble := false.B
memVioRedirectBubble := false.B
// override can happen between s1-s2 and s2-s3
val overrideBubble = Wire(Vec(numOfStage - 1, Bool()))
def overrideStage = 1
// ftq update block can happen on s1, s2 and s3
val ftqUpdateBubble = Wire(Vec(numOfStage, Bool()))
def ftqUpdateStage = 0
// ftq full stall only happens on s3 (last stage)
val ftqFullStall = Wire(Bool())
// by default, no bubble event
topdown_stages(0) := 0.U.asTypeOf(new FrontendTopDownBundle)
// event movement driven by clock only
for (i <- 0 until numOfStage - 1) {
topdown_stages(i + 1) := topdown_stages(i)
}
// ctrl signal
predictors.io.ctrl := ctrl
predictors.io.reset_vector := io.reset_vector
......@@ -644,6 +682,74 @@ class Predictor(implicit p: Parameters) extends XSModule with HasBPUConst with H
}
}
// Commit time history checker
if (EnableCommitGHistDiff) {
val commitGHist = RegInit(0.U.asTypeOf(Vec(HistoryLength, Bool())))
val commitGHistPtr = RegInit(0.U.asTypeOf(new CGHPtr))
def getCommitHist(ptr: CGHPtr): UInt =
(Cat(commitGHist.asUInt, commitGHist.asUInt) >> (ptr.value+1.U))(HistoryLength-1, 0)
val updateValid : Bool = io.ftq_to_bpu.update.valid
val branchValidMask : UInt = io.ftq_to_bpu.update.bits.ftb_entry.brValids.asUInt
val branchCommittedMask: Vec[Bool] = io.ftq_to_bpu.update.bits.br_committed
val misPredictMask : UInt = io.ftq_to_bpu.update.bits.mispred_mask.asUInt
val takenMask : UInt =
io.ftq_to_bpu.update.bits.br_taken_mask.asUInt |
io.ftq_to_bpu.update.bits.ftb_entry.always_taken.asUInt // Always taken branch is recorded in history
val takenIdx : UInt = (PriorityEncoder(takenMask) + 1.U((log2Ceil(numBr)+1).W)).asUInt
val misPredictIdx : UInt = (PriorityEncoder(misPredictMask) + 1.U((log2Ceil(numBr)+1).W)).asUInt
val shouldShiftMask: UInt = Mux(takenMask.orR,
LowerMask(takenIdx).asUInt,
((1 << numBr) - 1).asUInt) &
Mux(misPredictMask.orR,
LowerMask(misPredictIdx).asUInt,
((1 << numBr) - 1).asUInt) &
branchCommittedMask.asUInt
val updateShift : UInt =
Mux(updateValid && branchValidMask.orR, PopCount(branchValidMask & shouldShiftMask), 0.U)
dontTouch(updateShift)
dontTouch(commitGHist)
dontTouch(commitGHistPtr)
dontTouch(takenMask)
dontTouch(branchValidMask)
dontTouch(branchCommittedMask)
// Maintain the commitGHist
for (i <- 0 until numBr) {
when(updateShift >= (i + 1).U) {
val ptr: CGHPtr = commitGHistPtr - i.asUInt
commitGHist(ptr.value) := takenMask(i)
}
}
when(updateValid) {
commitGHistPtr := commitGHistPtr - updateShift
}
// Calculate true history using Parallel XOR
def computeFoldedHist(hist: UInt, compLen: Int)(histLen: Int): UInt = {
if (histLen > 0) {
val nChunks = (histLen + compLen - 1) / compLen
val hist_chunks = (0 until nChunks) map { i =>
hist(min((i + 1) * compLen, histLen) - 1, i * compLen)
}
ParallelXOR(hist_chunks)
}
else 0.U
}
// Do differential
val predictFHistAll: AllFoldedHistories = io.ftq_to_bpu.update.bits.spec_info.folded_hist
TageTableInfos.map {
case (nRows, histLen, _) => {
val nRowsPerBr = nRows / numBr
val commitTrueHist: UInt = computeFoldedHist(getCommitHist(commitGHistPtr), log2Ceil(nRowsPerBr))(histLen)
val predictFHist : UInt = predictFHistAll.
getHistWithInfo((histLen, min(histLen, log2Ceil(nRowsPerBr)))).folded_hist
XSWarn(updateValid && predictFHist =/= commitTrueHist,
p"predict time ghist: ${predictFHist} is different from commit time: ${commitTrueHist}\n")
}
}
}
// val updatedGh = oldGh.update(shift, taken && addIntoHist)
......@@ -677,6 +783,87 @@ class Predictor(implicit p: Parameters) extends XSModule with HasBPUConst with H
}
}
// TODO: signals for memVio and other Redirects
controlRedirectBubble := do_redirect.valid && do_redirect.bits.ControlRedirectBubble
ControlBTBMissBubble := do_redirect.bits.ControlBTBMissBubble
TAGEMissBubble := do_redirect.bits.TAGEMissBubble
SCMissBubble := do_redirect.bits.SCMissBubble
ITTAGEMissBubble := do_redirect.bits.ITTAGEMissBubble
RASMissBubble := do_redirect.bits.RASMissBubble
memVioRedirectBubble := do_redirect.valid && do_redirect.bits.MemVioRedirectBubble
otherRedirectBubble := do_redirect.valid && do_redirect.bits.OtherRedirectBubble
btbMissBubble := do_redirect.valid && do_redirect.bits.BTBMissBubble
overrideBubble(0) := s2_redirect
overrideBubble(1) := s3_redirect
ftqUpdateBubble(0) := !s1_components_ready
ftqUpdateBubble(1) := !s2_components_ready
ftqUpdateBubble(2) := !s3_components_ready
ftqFullStall := !io.bpu_to_ftq.resp.ready
io.bpu_to_ftq.resp.bits.topdown_info := topdown_stages(numOfStage - 1)
// topdown handling logic here
when (controlRedirectBubble) {
/*
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.ControlRedirectBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.ControlRedirectBubble.id) := true.B
*/
when (ControlBTBMissBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.BTBMissBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.BTBMissBubble.id) := true.B
} .elsewhen (TAGEMissBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.TAGEMissBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.TAGEMissBubble.id) := true.B
} .elsewhen (SCMissBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.SCMissBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.SCMissBubble.id) := true.B
} .elsewhen (ITTAGEMissBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.ITTAGEMissBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.ITTAGEMissBubble.id) := true.B
} .elsewhen (RASMissBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.RASMissBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.RASMissBubble.id) := true.B
}
}
when (memVioRedirectBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.MemVioRedirectBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.MemVioRedirectBubble.id) := true.B
}
when (otherRedirectBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.OtherRedirectBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.OtherRedirectBubble.id) := true.B
}
when (btbMissBubble) {
for (i <- 0 until numOfStage)
topdown_stages(i).reasons(TopDownCounters.BTBMissBubble.id) := true.B
io.bpu_to_ftq.resp.bits.topdown_info.reasons(TopDownCounters.BTBMissBubble.id) := true.B
}
for (i <- 0 until numOfStage) {
if (i < numOfStage - overrideStage) {
when (overrideBubble(i)) {
for (j <- 0 to i)
topdown_stages(j).reasons(TopDownCounters.OverrideBubble.id) := true.B
}
}
if (i < numOfStage - ftqUpdateStage) {
when (ftqUpdateBubble(i)) {
topdown_stages(i).reasons(TopDownCounters.FtqUpdateBubble.id) := true.B
}
}
}
when (ftqFullStall) {
topdown_stages(0).reasons(TopDownCounters.FtqFullStall.id) := true.B
}
XSError(isBefore(redirect.cfiUpdate.histPtr, s3_ghist_ptr) && do_redirect.valid, p"s3_ghist_ptr ${s3_ghist_ptr} exceeds redirect histPtr ${redirect.cfiUpdate.histPtr}\n")
XSError(isBefore(redirect.cfiUpdate.histPtr, s2_ghist_ptr) && do_redirect.valid, p"s2_ghist_ptr ${s2_ghist_ptr} exceeds redirect histPtr ${redirect.cfiUpdate.histPtr}\n")
XSError(isBefore(redirect.cfiUpdate.histPtr, s1_ghist_ptr) && do_redirect.valid, p"s1_ghist_ptr ${s1_ghist_ptr} exceeds redirect histPtr ${redirect.cfiUpdate.histPtr}\n")
......
......@@ -56,6 +56,8 @@ class FtbSlot(val offsetLen: Int, val subOffsetLen: Option[Int] = None)(implicit
val sharing = Bool()
val valid = Bool()
val sc = Bool() // set by sc in s3, perf use only
def setLowerStatByTarget(pc: UInt, target: UInt, isShare: Boolean) = {
def getTargetStatByHigher(pc_higher: UInt, target_higher: UInt) =
Mux(target_higher > pc_higher, TAR_OVF,
......
......@@ -44,7 +44,7 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
val hartId = Input(UInt(8.W))
val reset_vector = Input(UInt(PAddrBits.W))
val fencei = Input(Bool())
val ptw = new VectorTlbPtwIO(4)
val ptw = new VectorTlbPtwIO(coreParams.itlbPortNum)
val backend = new FrontendToCtrlIO
val sfence = Input(new SfenceBundle)
val tlbCsr = Input(new TlbCsrBundle)
......@@ -69,6 +69,13 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
val ftq = Module(new Ftq)
val needFlush = RegNext(io.backend.toFtq.redirect.valid)
val FlushControlRedirect = RegNext(io.backend.toFtq.redirect.bits.debugIsCtrl)
val FlushMemVioRedirect = RegNext(io.backend.toFtq.redirect.bits.debugIsMemVio)
val FlushControlBTBMiss = Wire(Bool())
val FlushTAGEMiss = Wire(Bool())
val FlushSCMiss = Wire(Bool())
val FlushITTAGEMiss = Wire(Bool())
val FlushRASMiss = Wire(Bool())
val tlbCsr = DelayN(io.tlbCsr, 2)
val csrCtrl = DelayN(io.csrCtrl, 2)
......@@ -84,26 +91,24 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
bpu.io.reset_vector := io.reset_vector
// pmp
val prefetchPipeNum = ICacheParameters().prefetchPipeNum
val pmp = Module(new PMP())
val pmp_check = VecInit(Seq.fill(4)(Module(new PMPChecker(3, sameCycle = true)).io))
val pmp_check = VecInit(Seq.fill(coreParams.ipmpPortNum)(Module(new PMPChecker(3, sameCycle = true)).io))
pmp.io.distribute_csr := csrCtrl.distribute_csr
val pmp_req_vec = Wire(Vec(4, Valid(new PMPReqBundle())))
pmp_req_vec(0) <> icache.io.pmp(0).req
pmp_req_vec(1) <> icache.io.pmp(1).req
pmp_req_vec(2) <> icache.io.pmp(2).req
pmp_req_vec(3) <> ifu.io.pmp.req
val pmp_req_vec = Wire(Vec(coreParams.ipmpPortNum, Valid(new PMPReqBundle())))
(0 until 2 + prefetchPipeNum).foreach(i => pmp_req_vec(i) <> icache.io.pmp(i).req)
pmp_req_vec.last <> ifu.io.pmp.req
for (i <- pmp_check.indices) {
pmp_check(i).apply(tlbCsr.priv.imode, pmp.io.pmp, pmp.io.pma, pmp_req_vec(i))
}
icache.io.pmp(0).resp <> pmp_check(0).resp
icache.io.pmp(1).resp <> pmp_check(1).resp
icache.io.pmp(2).resp <> pmp_check(2).resp
ifu.io.pmp.resp <> pmp_check(3).resp
val itlb = Module(new TLB(4, nRespDups = 1, Seq(true, true, false, true), itlbParams))
itlb.io.requestor.take(3) zip icache.io.itlb foreach {case (a,b) => a <> b}
itlb.io.requestor(3) <> ifu.io.iTLBInter // mmio may need re-tlb, blocked
(0 until 2 + prefetchPipeNum).foreach(i => icache.io.pmp(i).resp <> pmp_check(i).resp)
ifu.io.pmp.resp <> pmp_check.last.resp
val itlb = Module(new TLB(coreParams.itlbPortNum, nRespDups = 1,
Seq(true, true) ++ Seq.fill(prefetchPipeNum)(false) ++ Seq(true), itlbParams))
itlb.io.requestor.take(2 + prefetchPipeNum) zip icache.io.itlb foreach {case (a,b) => a <> b}
itlb.io.requestor.last <> ifu.io.iTLBInter // mmio may need re-tlb, blocked
itlb.io.base_connect(io.sfence, tlbCsr)
io.ptw.connect(itlb.io.ptw)
itlb.io.ptw_replenish <> DontCare
......@@ -128,6 +133,8 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
ifu.io.icacheInter.resp <> icache.io.fetch.resp
ifu.io.icacheInter.icacheReady := icache.io.toIFU
ifu.io.icacheInter.topdownIcacheMiss := icache.io.fetch.topdownIcacheMiss
ifu.io.icacheInter.topdownItlbMiss := icache.io.fetch.topdownItlbMiss
icache.io.stop := ifu.io.icacheStop
ifu.io.icachePerfInfo := icache.io.perfInfo
......@@ -138,6 +145,8 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
icache.io.csr_pf_enable := RegNext(csrCtrl.l1I_pf_enable)
icache.io.csr_parity_enable := RegNext(csrCtrl.icache_parity_enable)
icache.io.fencei := io.fencei
//IFU-Ibuffer
ifu.io.toIbuffer <> ibuffer.io.in
......@@ -148,7 +157,23 @@ class FrontendImp (outer: Frontend) extends LazyModuleImp(outer)
ifu.io.rob_commits <> io.backend.toFtq.rob_commits
ibuffer.io.flush := needFlush
ibuffer.io.ControlRedirect := FlushControlRedirect
ibuffer.io.MemVioRedirect := FlushMemVioRedirect
ibuffer.io.ControlBTBMissBubble := FlushControlBTBMiss
ibuffer.io.TAGEMissBubble := FlushTAGEMiss
ibuffer.io.SCMissBubble := FlushSCMiss
ibuffer.io.ITTAGEMissBubble := FlushITTAGEMiss
ibuffer.io.RASMissBubble := FlushRASMiss
FlushControlBTBMiss := ftq.io.ControlBTBMissBubble
FlushTAGEMiss := ftq.io.TAGEMissBubble
FlushSCMiss := ftq.io.SCMissBubble
FlushITTAGEMiss := ftq.io.ITTAGEMissBubble
FlushRASMiss := ftq.io.RASMissBubble
io.backend.cfVec <> ibuffer.io.out
io.backend.stallReason <> ibuffer.io.stallReason
dontTouch(io.backend.stallReason)
instrUncache.io.req <> ifu.io.uncacheInter.toUncache
ifu.io.uncacheInter.fromUncache <> instrUncache.io.resp
......
......@@ -24,6 +24,12 @@ import xiangshan.frontend.icache._
import utils._
import utility._
import scala.math._
import java.util.ResourceBundle.Control
class FrontendTopDownBundle(implicit p: Parameters) extends XSBundle {
val reasons = Vec(TopDownCounters.NumStallReasons.id, Bool())
val stallWidth = UInt(log2Ceil(PredictWidth).W)
}
@chiselName
class FetchRequestBundle(implicit p: Parameters) extends XSBundle with HasICacheParameters {
......@@ -36,6 +42,8 @@ class FetchRequestBundle(implicit p: Parameters) extends XSBundle with HasICache
val ftqIdx = new FtqPtr
val ftqOffset = ValidUndirectioned(UInt(log2Ceil(PredictWidth).W))
val topdown_info = new FrontendTopDownBundle
def crossCacheline = startAddr(blockOffBits - 1) === 1.U
def fromFtqPcBundle(b: Ftq_RF_Components) = {
......@@ -74,6 +82,8 @@ class FtqICacheInfo(implicit p: Parameters)extends XSBundle with HasICacheParame
class IFUICacheIO(implicit p: Parameters)extends XSBundle with HasICacheParameters{
val icacheReady = Output(Bool())
val resp = Vec(PortNumber, ValidIO(new ICacheMainPipeResp))
val topdownIcacheMiss = Output(Bool())
val topdownItlbMiss = Output(Bool())
}
class FtqToICacheRequestBundle(implicit p: Parameters)extends XSBundle with HasICacheParameters{
......@@ -121,6 +131,8 @@ class FetchToIBuffer(implicit p: Parameters) extends XSBundle {
val acf = Vec(PredictWidth, Bool())
val crossPageIPFFix = Vec(PredictWidth, Bool())
val triggered = Vec(PredictWidth, new TriggerCf)
val topdown_info = new FrontendTopDownBundle
}
// class BitWiseUInt(val width: Int, val init: UInt) extends Module {
......@@ -569,6 +581,8 @@ class BranchPredictionResp(implicit p: Parameters) extends XSBundle with HasBPUC
val last_stage_spec_info = new SpeculativeInfo
val last_stage_ftb_entry = new FTBEntry
val topdown_info = new FrontendTopDownBundle
def selectedResp ={
val res =
PriorityMux(Seq(
......@@ -596,6 +610,7 @@ class BranchPredictionUpdate(implicit p: Parameters) extends XSBundle with HasBP
val cfi_idx = ValidUndirectioned(UInt(log2Ceil(PredictWidth).W))
val br_taken_mask = Vec(numBr, Bool())
val br_committed = Vec(numBr, Bool()) // High only when br valid && br committed
val jmp_taken = Bool()
val mispred_mask = Vec(numBr+1, Bool())
val pred_hit = Bool()
......@@ -638,6 +653,28 @@ class BranchPredictionRedirect(implicit p: Parameters) extends Redirect with Has
// }
// TODO: backend should pass topdown signals here
// must not change its parent since BPU has used asTypeOf(this type) from its parent class
require(isInstanceOf[Redirect])
val BTBMissBubble = Bool()
def ControlRedirectBubble = debugIsCtrl
// if mispred br not in ftb, count as BTB miss
def ControlBTBMissBubble = ControlRedirectBubble && !cfiUpdate.br_hit && !cfiUpdate.jr_hit
def TAGEMissBubble = ControlRedirectBubble && cfiUpdate.br_hit && !cfiUpdate.sc_hit
def SCMissBubble = ControlRedirectBubble && cfiUpdate.br_hit && cfiUpdate.sc_hit
def ITTAGEMissBubble = ControlRedirectBubble && cfiUpdate.jr_hit && !cfiUpdate.pd.isRet
def RASMissBubble = ControlRedirectBubble && cfiUpdate.jr_hit && cfiUpdate.pd.isRet
def MemVioRedirectBubble = debugIsMemVio
def OtherRedirectBubble = !debugIsCtrl && !debugIsMemVio
def connectRedirect(source: Redirect): Unit = {
for ((name, data) <- this.elements) {
if (source.elements.contains(name)) {
data := source.elements(name)
}
}
}
def display(cond: Bool): Unit = {
XSDebug(cond, p"-----------BranchPredictionRedirect----------- \n")
XSDebug(cond, p"-----------cfiUpdate----------- \n")
......
......@@ -133,6 +133,74 @@ class NewIFU(implicit p: Parameters) extends XSModule
def isLastInCacheline(addr: UInt): Bool = addr(blockOffBits - 1, 1) === 0.U
def numOfStage = 3
require(numOfStage > 1, "BPU numOfStage must be greater than 1")
val topdown_stages = RegInit(VecInit(Seq.fill(numOfStage)(0.U.asTypeOf(new FrontendTopDownBundle))))
dontTouch(topdown_stages)
// bubble events in IFU, only happen in stage 1
val icacheMissBubble = Wire(Bool())
val itlbMissBubble =Wire(Bool())
// only driven by clock, not valid-ready
topdown_stages(0) := fromFtq.req.bits.topdown_info
for (i <- 1 until numOfStage) {
topdown_stages(i) := topdown_stages(i - 1)
}
when (icacheMissBubble) {
topdown_stages(1).reasons(TopDownCounters.ICacheMissBubble.id) := true.B
}
when (itlbMissBubble) {
topdown_stages(1).reasons(TopDownCounters.ITLBMissBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info := topdown_stages(numOfStage - 1)
when (fromFtq.topdown_redirect.valid) {
// only redirect from backend, IFU redirect itself is handled elsewhere
when (fromFtq.topdown_redirect.bits.debugIsCtrl) {
/*
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.ControlRedirectBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.ControlRedirectBubble.id) := true.B
*/
when (fromFtq.topdown_redirect.bits.ControlBTBMissBubble) {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.BTBMissBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.BTBMissBubble.id) := true.B
} .elsewhen (fromFtq.topdown_redirect.bits.TAGEMissBubble) {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.TAGEMissBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.TAGEMissBubble.id) := true.B
} .elsewhen (fromFtq.topdown_redirect.bits.SCMissBubble) {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.SCMissBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.SCMissBubble.id) := true.B
} .elsewhen (fromFtq.topdown_redirect.bits.ITTAGEMissBubble) {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.ITTAGEMissBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.ITTAGEMissBubble.id) := true.B
} .elsewhen (fromFtq.topdown_redirect.bits.RASMissBubble) {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.RASMissBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.RASMissBubble.id) := true.B
}
} .elsewhen (fromFtq.topdown_redirect.bits.debugIsMemVio) {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.MemVioRedirectBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.MemVioRedirectBubble.id) := true.B
} .otherwise {
for (i <- 0 until numOfStage) {
topdown_stages(i).reasons(TopDownCounters.OtherRedirectBubble.id) := true.B
}
io.toIbuffer.bits.topdown_info.reasons(TopDownCounters.OtherRedirectBubble.id) := true.B
}
}
class TlbExept(implicit p: Parameters) extends XSBundle{
val pageFault = Bool()
val accessFault = Bool()
......@@ -180,6 +248,16 @@ class NewIFU(implicit p: Parameters) extends XSModule
fromFtq.req.ready := f1_ready && io.icacheInter.icacheReady
when (wb_redirect) {
when (f3_wb_not_flush) {
topdown_stages(2).reasons(TopDownCounters.BTBMissBubble.id) := true.B
}
for (i <- 0 until numOfStage - 1) {
topdown_stages(i).reasons(TopDownCounters.BTBMissBubble.id) := true.B
}
}
/** <PERF> f0 fetch bubble */
XSPerfAccumulate("fetch_bubble_ftq_not_valid", !fromFtq.req.valid && fromFtq.req.ready )
......@@ -247,6 +325,9 @@ class NewIFU(implicit p: Parameters) extends XSModule
icacheRespAllValid := f2_icache_all_resp_reg || f2_icache_all_resp_wire
icacheMissBubble := io.icacheInter.topdownIcacheMiss
itlbMissBubble := io.icacheInter.topdownItlbMiss
io.icacheStop := !f3_ready
when(f2_flush) {f2_icache_all_resp_reg := false.B}
......@@ -389,7 +470,7 @@ class NewIFU(implicit p: Parameters) extends XSModule
val f3_af_vec = RegEnable(next = f2_af_vec, enable = f2_fire)
val f3_pf_vec = RegEnable(next = f2_pf_vec , enable = f2_fire)
val f3_pc = RegEnable(next = f2_pc, enable = f2_fire)
val f3_half_snpc = RegEnable(next = f2_half_snpc, enable = f2_fire)
val f3_half_snpc = RegEnable(next = f2_half_snpc, enable = f2_fire)
val f3_instr_range = RegEnable(next = f2_instr_range, enable = f2_fire)
val f3_foldpc = RegEnable(next = f2_foldpc, enable = f2_fire)
val f3_crossPageFault = RegEnable(next = f2_crossPageFault, enable = f2_fire)
......@@ -582,7 +663,7 @@ class NewIFU(implicit p: Parameters) extends XSModule
!f3_pd(idx).isRVC && checkerOutStage1.fixedRange(idx) && f3_instr_valid(idx) && !checkerOutStage1.fixedTaken(idx) && ! f3_req_is_mmio
}
val f3_last_validIdx = ~ParallelPriorityEncoder(checkerOutStage1.fixedRange.reverse)
val f3_last_validIdx = ParallelPosteriorityEncoder(checkerOutStage1.fixedRange)
val f3_hasLastHalf = hasLastHalf((PredictWidth - 1).U)
val f3_false_lastHalf = hasLastHalf(f3_last_validIdx)
......@@ -746,6 +827,8 @@ class NewIFU(implicit p: Parameters) extends XSModule
}
val checkFlushWb = Wire(Valid(new PredecodeWritebackBundle))
val checkFlushWbjalTargetIdx = ParallelPriorityEncoder(VecInit(wb_pd.zip(wb_instr_valid).map{case (pd, v) => v && pd.isJal }))
val checkFlushWbTargetIdx = ParallelPriorityEncoder(wb_check_result_stage2.fixedMissPred)
checkFlushWb.valid := wb_valid
checkFlushWb.bits.pc := wb_pc
checkFlushWb.bits.pd := wb_pd
......@@ -756,8 +839,8 @@ class NewIFU(implicit p: Parameters) extends XSModule
checkFlushWb.bits.misOffset.bits := Mux(wb_half_flush, wb_lastIdx, ParallelPriorityEncoder(wb_check_result_stage2.fixedMissPred))
checkFlushWb.bits.cfiOffset.valid := ParallelOR(wb_check_result_stage1.fixedTaken)
checkFlushWb.bits.cfiOffset.bits := ParallelPriorityEncoder(wb_check_result_stage1.fixedTaken)
checkFlushWb.bits.target := Mux(wb_half_flush, wb_half_target, wb_check_result_stage2.fixedTarget(ParallelPriorityEncoder(wb_check_result_stage2.fixedMissPred)))
checkFlushWb.bits.jalTarget := wb_check_result_stage2.fixedTarget(ParallelPriorityEncoder(VecInit(wb_pd.zip(wb_instr_valid).map{case (pd, v) => v && pd.isJal })))
checkFlushWb.bits.target := Mux(wb_half_flush, wb_half_target, wb_check_result_stage2.fixedTarget(checkFlushWbTargetIdx))
checkFlushWb.bits.jalTarget := wb_check_result_stage2.fixedTarget(checkFlushWbjalTargetIdx)
checkFlushWb.bits.instrRange := wb_instr_range.asTypeOf(Vec(PredictWidth, Bool()))
toFtq.pdWb := Mux(wb_valid, checkFlushWb, mmioFlushWb)
......@@ -827,6 +910,8 @@ class NewIFU(implicit p: Parameters) extends XSModule
XSPerfAccumulate("except_0", f3_perf_info.except_0 && io.toIbuffer.fire() )
XSPerfHistogram("ifu2ibuffer_validCnt", PopCount(io.toIbuffer.bits.valid & io.toIbuffer.bits.enqEnable), io.toIbuffer.fire, 0, PredictWidth + 1, 1)
val isWriteFetchToIBufferTable = WireInit(Constantin.createRecord("isWriteFetchToIBufferTable" + p(XSCoreParamsKey).HartId.toString))
val isWriteIfuWbToFtqTable = WireInit(Constantin.createRecord("isWriteIfuWbToFtqTable" + p(XSCoreParamsKey).HartId.toString))
val fetchToIBufferTable = ChiselDB.createTable("FetchToIBuffer" + p(XSCoreParamsKey).HartId.toString, new FetchToIBufferDB)
val ifuWbToFtqTable = ChiselDB.createTable("IfuWbToFtq" + p(XSCoreParamsKey).HartId.toString, new IfuWbToFtqDB)
......@@ -848,14 +933,14 @@ class NewIFU(implicit p: Parameters) extends XSModule
fetchToIBufferTable.log(
data = fetchIBufferDumpData,
en = io.toIbuffer.fire(),
en = isWriteFetchToIBufferTable.orR && io.toIbuffer.fire,
site = "IFU" + p(XSCoreParamsKey).HartId.toString,
clock = clock,
reset = reset
)
ifuWbToFtqTable.log(
data = ifuWbToFtqDumpData,
en = checkFlushWb.valid,
en = isWriteIfuWbToFtqTable.orR && checkFlushWb.valid,
site = "IFU" + p(XSCoreParamsKey).HartId.toString,
clock = clock,
reset = reset
......
......@@ -273,11 +273,10 @@ class ITTageTable
us.io.waddr := update_idx
us.io.wdata := io.update.u
val wrbypass = Module(new WrBypass(UInt(ITTageCtrBits.W), wrBypassEntries, log2Ceil(nRows), tagWidth=tagLen))
val wrbypass = Module(new WrBypass(UInt(ITTageCtrBits.W), wrBypassEntries, log2Ceil(nRows)))
wrbypass.io.wen := io.update.valid
wrbypass.io.write_idx := update_idx
wrbypass.io.write_tag.map(_ := update_tag)
wrbypass.io.write_data.map(_ := update_wdata.ctr)
val old_ctr = Mux(wrbypass.io.hit, wrbypass.io.hit_data(0).bits, io.update.oldCtr)
......@@ -420,7 +419,7 @@ class ITTage(implicit p: Parameters) extends BaseITTage {
val update = io.update.bits
val updateValid =
update.is_jalr && !update.is_ret && u_valid && update.ftb_entry.jmpValid &&
update.jmp_taken
update.jmp_taken && update.cfi_idx.valid && update.cfi_idx.bits === update.ftb_entry.tailSlot.offset
val updateFhist = update.spec_info.folded_hist
// meta is splited by composer
......@@ -470,12 +469,12 @@ class ITTage(implicit p: Parameters) extends BaseITTage {
s2_tageTaken := Mux1H(Seq(
(provided && !providerNull, providerInfo.ctr(ITTageCtrBits-1)),
(altProvided && providerNull, altProviderInfo.ctr(ITTageCtrBits-1)),
(!provided, basePred)
(!provided || providerNull && !altProvided, basePred)
)) // TODO: reintroduce BIM
s2_tageTarget := Mux1H(Seq(
(provided && !providerNull, providerInfo.target),
(altProvided && providerNull, altProviderInfo.target),
(!provided, baseTarget)
(!provided || providerNull && !altProvided, baseTarget)
))
s2_finalAltPred := Mux(altProvided, altProviderInfo.ctr(ITTageCtrBits-1), basePred)
s2_provided := provided
......
......@@ -31,9 +31,17 @@ class IbufPtr(implicit p: Parameters) extends CircularQueuePtr[IbufPtr](
class IBufferIO(implicit p: Parameters) extends XSBundle {
val flush = Input(Bool())
val ControlRedirect = Input(Bool())
val ControlBTBMissBubble = Input(Bool())
val TAGEMissBubble = Input(Bool())
val SCMissBubble = Input(Bool())
val ITTAGEMissBubble = Input(Bool())
val RASMissBubble = Input(Bool())
val MemVioRedirect = Input(Bool())
val in = Flipped(DecoupledIO(new FetchToIBuffer))
val out = Vec(DecodeWidth, DecoupledIO(new CtrlFlow))
val full = Output(Bool())
val stallReason = new StallReasonIO(DecodeWidth)
}
class IBufEntry(implicit p: Parameters) extends XSBundle {
......@@ -89,6 +97,38 @@ class IBufEntry(implicit p: Parameters) extends XSBundle {
class Ibuffer(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper with HasPerfEvents {
val io = IO(new IBufferIO)
dontTouch(io.stallReason)
val topdown_stage = RegInit(0.U.asTypeOf(new FrontendTopDownBundle))
dontTouch(topdown_stage)
topdown_stage := io.in.bits.topdown_info
when (io.flush) {
when (io.ControlRedirect) {
when (io.ControlBTBMissBubble) {
topdown_stage.reasons(TopDownCounters.BTBMissBubble.id) := true.B
} .elsewhen (io.TAGEMissBubble) {
topdown_stage.reasons(TopDownCounters.TAGEMissBubble.id) := true.B
} .elsewhen (io.SCMissBubble) {
topdown_stage.reasons(TopDownCounters.SCMissBubble.id) := true.B
} .elsewhen (io.ITTAGEMissBubble) {
topdown_stage.reasons(TopDownCounters.ITTAGEMissBubble.id) := true.B
} .elsewhen (io.RASMissBubble) {
topdown_stage.reasons(TopDownCounters.RASMissBubble.id) := true.B
}
} .elsewhen (io.MemVioRedirect) {
topdown_stage.reasons(TopDownCounters.MemVioRedirectBubble.id) := true.B
} .otherwise {
topdown_stage.reasons(TopDownCounters.OtherRedirectBubble.id) := true.B
}
}
val dequeueInsufficient = Wire(Bool())
val matchBubble = Wire(UInt(log2Up(TopDownCounters.NumStallReasons.id).W))
matchBubble := (TopDownCounters.NumStallReasons.id - 1).U - PriorityEncoder(topdown_stage.reasons.reverse)
dontTouch(matchBubble)
val matchBubbleVec = WireInit(VecInit(topdown_stage.reasons.zipWithIndex.map{case (r, i) => matchBubble === i.U}))
val ibuf = Module(new SyncDataModuleTemplate(new IBufEntry, IBufSize, 2 * DecodeWidth, PredictWidth))
......@@ -132,6 +172,32 @@ class Ibuffer(implicit p: Parameters) extends XSModule with HasCircularQueuePtrH
((1 << DecodeWidth) - 1).U,
UIntToMask(validEntries(log2Ceil(DecodeWidth) - 1, 0), DecodeWidth)
)
val deqValidCount = PopCount(validVec.asBools)
val deqWasteCount = DecodeWidth.U - deqValidCount
dequeueInsufficient := deqValidCount < DecodeWidth.U
io.stallReason.reason.map(_ := 0.U)
for (i <- 0 until DecodeWidth) {
when (i.U < deqWasteCount) {
io.stallReason.reason(DecodeWidth - i - 1) := matchBubble
}
}
when (!(deqWasteCount === DecodeWidth.U || topdown_stage.reasons.asUInt.orR)) {
// should set reason for FetchFragmentationStall
// topdown_stage.reasons(TopDownCounters.FetchFragmentationStall.id) := true.B
for (i <- 0 until DecodeWidth) {
when (i.U < deqWasteCount) {
io.stallReason.reason(DecodeWidth - i - 1) := TopDownCounters.FetchFragBubble.id.U
}
}
}
when (io.stallReason.backReason.valid) {
io.stallReason.reason.map(_ := io.stallReason.backReason.bits)
}
val deqData = Reg(Vec(DecodeWidth, new IBufEntry))
for (i <- 0 until DecodeWidth) {
io.out(i).valid := validVec(i)
......@@ -203,10 +269,26 @@ class Ibuffer(implicit p: Parameters) extends XSModule with HasCircularQueuePtrH
QueuePerf(IBufSize, validEntries, !allowEnq)
XSPerfAccumulate("flush", io.flush)
XSPerfAccumulate("hungry", instrHungry)
if (env.EnableTopDown) {
val ibuffer_IDWidth_hvButNotFull = afterInit && (validEntries =/= 0.U) && (validEntries < DecodeWidth.U) && !headBubble
XSPerfAccumulate("ibuffer_IDWidth_hvButNotFull", ibuffer_IDWidth_hvButNotFull)
}
val ibuffer_IDWidth_hvButNotFull = afterInit && (validEntries =/= 0.U) && (validEntries < DecodeWidth.U) && !headBubble
XSPerfAccumulate("ibuffer_IDWidth_hvButNotFull", ibuffer_IDWidth_hvButNotFull)
/*
XSPerfAccumulate("ICacheMissBubble", Mux(matchBubbleVec(TopDownCounters.ICacheMissBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("ITLBMissBubble", Mux(matchBubbleVec(TopDownCounters.ITLBMissBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("ControlRedirectBubble", Mux(matchBubbleVec(TopDownCounters.ControlRedirectBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("MemVioRedirectBubble", Mux(matchBubbleVec(TopDownCounters.MemVioRedirectBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("OtherRedirectBubble", Mux(matchBubbleVec(TopDownCounters.OtherRedirectBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("BTBMissBubble", Mux(matchBubbleVec(TopDownCounters.BTBMissBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("OverrideBubble", Mux(matchBubbleVec(TopDownCounters.OverrideBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("FtqUpdateBubble", Mux(matchBubbleVec(TopDownCounters.FtqUpdateBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("FtqFullStall", Mux(matchBubbleVec(TopDownCounters.FtqFullStall.id), deqWasteCount, 0.U))
XSPerfAccumulate("FetchFragmentBubble",
Mux(deqWasteCount === DecodeWidth.U || topdown_stage.reasons.asUInt.orR, 0.U, deqWasteCount))
XSPerfAccumulate("TAGEMissBubble", Mux(matchBubbleVec(TopDownCounters.TAGEMissBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("SCMissBubble", Mux(matchBubbleVec(TopDownCounters.SCMissBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("ITTAGEMissBubble", Mux(matchBubbleVec(TopDownCounters.ITTAGEMissBubble.id), deqWasteCount, 0.U))
XSPerfAccumulate("RASMissBubble", Mux(matchBubbleVec(TopDownCounters.RASMissBubble.id), deqWasteCount, 0.U))
*/
val perfEvents = Seq(
("IBuffer_Flushed ", io.flush ),
......
......@@ -291,6 +291,11 @@ trait HasSC extends HasSCParameter with HasPerfEvents { this: Tage =>
s2_tageTakens(w)
)
val s3_disagree = RegEnable(s2_disagree, io.s2_fire)
// FIXME: not portable
io.out.last_stage_ftb_entry.brSlots(0).sc := RegEnable(s2_disagree(0), io.s2_fire)
io.out.last_stage_ftb_entry.tailSlot.sc := RegEnable(s2_disagree(1), io.s2_fire)
scMeta.tageTakens(w) := RegEnable(s2_tageTakens(w), io.s2_fire)
scMeta.scUsed(w) := RegEnable(s2_provideds(w), io.s2_fire)
scMeta.scPreds(w) := RegEnable(s2_scPreds(s2_chooseBit), io.s2_fire)
......
......@@ -420,7 +420,7 @@ class TageTable
}
val bank_wrbypasses = Seq.fill(nBanks)(Seq.fill(numBr)(
Module(new WrBypass(UInt(TageCtrBits.W), perBankWrbypassEntries, 1, tagWidth=tagLen))
Module(new WrBypass(UInt(TageCtrBits.W), perBankWrbypassEntries, log2Ceil(bankSize)))
)) // let it corresponds to logical brIdx
for (b <- 0 until nBanks) {
......@@ -456,7 +456,6 @@ class TageTable
val br_pidx = get_phy_br_idx(update_unhashed_idx, li)
wrbypass.io.wen := io.update.mask(li) && update_req_bank_1h(b)
wrbypass.io.write_idx := get_bank_idx(update_idx)
wrbypass.io.write_tag.map(_ := update_tag)
wrbypass.io.write_data(0) := Mux1H(UIntToOH(br_pidx, numBr), per_bank_update_wdata(b)).ctr
}
}
......@@ -677,12 +676,12 @@ class Tage(implicit p: Parameters) extends BaseTage {
resp_meta.allocates(i) := RegEnable(allocatableSlots, io.s2_fire)
val s1_bimCtr = bt.io.s1_cnt(i)
s1_altUsed(i) := !provided || providerInfo.use_alt_on_unconf
s1_tageTakens(i) :=
Mux(!provided || providerInfo.use_alt_on_unconf,
Mux(s1_altUsed(i),
s1_bimCtr(1),
providerInfo.resp.ctr(TageCtrBits-1)
)
s1_altUsed(i) := !provided || providerInfo.use_alt_on_unconf
s1_finalAltPreds(i) := s1_bimCtr(1)
s1_basecnts(i) := s1_bimCtr
s1_useAltOnNa(i) := providerInfo.use_alt_on_unconf
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -18,7 +18,7 @@ import xiangshan.backend.decode.DecodeUnit
object DecodeMain extends App with HasRocketChipStageUtils {
override def main(args: Array[String]): Unit = {
val (config, firrtlOpts, firrtlComplier) = ArgParser.parse(args)
val (config, firrtlOpts, firrtlComplier, firtoolOpts) = ArgParser.parse(args)
// //val soc = DisableMonitors(p => LazyModule(new XSTop()(p)))(config)
// If Complex Params are needed, wrap it with a Top Module to do dirty works,
// and use "chisel3.aop.Select.collectDeep[ModuleWanted](WrapperModule){case a: ModuleWanted => a}.head.Params"
......
Subproject commit c83eac5e93a94b514f7aca26f1c58e3934471d3b
Subproject commit 3d812fec9936ccd584df7721aa8c2d02e932d325
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册