Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenXiangShan
nexus-am
提交
d19b76e3
N
nexus-am
项目概览
OpenXiangShan
/
nexus-am
9 个月 前同步成功
通知
0
Star
21
Fork
25
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
N
nexus-am
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
d19b76e3
编写于
4月 06, 2023
作者:
W
William Wang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
maprobe: fix linear_access asm to support 2 load/cycle
上级
ceb50bb6
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
68 addition
and
48 deletion
+68
-48
apps/maprobe/include/maprobe.h
apps/maprobe/include/maprobe.h
+2
-3
apps/maprobe/latency-test.c
apps/maprobe/latency-test.c
+57
-42
apps/maprobe/main.c
apps/maprobe/main.c
+9
-3
未找到文件。
apps/maprobe/include/maprobe.h
浏览文件 @
d19b76e3
...
...
@@ -86,9 +86,8 @@ extern float test_l1_store_bandwidth(uint64_t size, int iter, int to_csv);
extern
float
test_l1_store_wcb_bandwidth
(
uint64_t
size
,
int
iter
,
int
to_csv
);
// key parameter matrix generate
void
generate_linear_access_latency_matrix
();
void
generate_continuosly_access_latency_matrix
();
void
generate_pointer_tracing_latency_matrix
();
void
generate_linear_access_latency_matrix
(
uint64_t
step
);
void
generate_pointer_tracing_latency_matrix
(
uint64_t
step
);
void
generate_random_access_latency_matrix
();
void
generate_replacement_test_matrix
();
...
...
apps/maprobe/latency-test.c
浏览文件 @
d19b76e3
...
...
@@ -169,7 +169,6 @@ float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter,
num_access
+=
num_access
%
8
?
8
-
num_access
%
8
:
0
;
assert
(
num_access
>=
8
);
// prepare access offset
uint64_t
address_offset_0
=
0
;
register
uint64_t
address_offset_1
=
step
*
1
;
register
uint64_t
address_offset_2
=
step
*
2
;
register
uint64_t
address_offset_3
=
step
*
3
;
...
...
@@ -184,23 +183,52 @@ float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter,
for
(
int
i
=
0
;
i
<
iter
;
i
++
)
{
uint64_t
address
=
_PERF_TEST_ADDR_BASE
;
for
(
int
j
=
0
;
j
<
num_access
;
j
+=
8
)
{
register
uint64_t
access_addr_0
=
address
+
address_offset_0
;
register
uint64_t
access_addr_1
=
address
+
address_offset_1
;
register
uint64_t
access_addr_2
=
address
+
address_offset_2
;
register
uint64_t
access_addr_3
=
address
+
address_offset_3
;
register
uint64_t
access_addr_4
=
address
+
address_offset_4
;
register
uint64_t
access_addr_5
=
address
+
address_offset_5
;
register
uint64_t
access_addr_6
=
address
+
address_offset_6
;
register
uint64_t
access_addr_7
=
address
+
address_offset_7
;
__asm__
volatile
(
"mv a1, %[addr]
\n
"
"add a2, %[addr], %[offset1]
\n
"
"add a3, %[addr], %[offset2]
\n
"
"add a4, %[addr], %[offset3]
\n
"
"add a5, %[addr], %[offset4]
\n
"
"add t0, %[addr], %[offset5]
\n
"
"add t1, %[addr], %[offset6]
\n
"
"add t2, %[addr], %[offset7]
\n
"
"ld a0, 0(a1)
\n
"
"ld a0, 0(a2)
\n
"
"ld a0, 0(a3)
\n
"
"ld a0, 0(a4)
\n
"
"ld a0, 0(a5)
\n
"
"ld a0, 0(t0)
\n
"
"ld a0, 0(t1)
\n
"
"ld a0, 0(t2)
\n
"
::
[
offset1
]
"r"
(
address_offset_1
),
[
offset2
]
"r"
(
address_offset_2
),
[
offset3
]
"r"
(
address_offset_3
),
[
offset4
]
"r"
(
address_offset_4
),
[
offset5
]
"r"
(
address_offset_5
),
[
offset6
]
"r"
(
address_offset_6
),
[
offset7
]
"r"
(
address_offset_7
),
[
addr
]
"r"
(
address
)
:
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"t0"
,
"t1"
,
"t2"
,
"t3"
);
address
+=
address_offset_8
;
__asm__
volatile
(
"ld a0, 0(%[addr])
\n
"
::
[
addr
]
"r"
(
access_addr_0
)
:
"a0"
);
__asm__
volatile
(
"ld a0, 0(%[addr])
\n
"
::
[
addr
]
"r"
(
access_addr_1
)
:
"a0"
);
__asm__
volatile
(
"ld a0, 0(%[addr])
\n
"
::
[
addr
]
"r"
(
access_addr_2
)
:
"a0"
);
__asm__
volatile
(
"ld a0, 0(%[addr])
\n
"
::
[
addr
]
"r"
(
access_addr_3
)
:
"a0"
);
__asm__
volatile
(
"ld a0, 0(%[addr])
\n
"
::
[
addr
]
"r"
(
access_addr_4
)
:
"a0"
);
__asm__
volatile
(
"ld a0, 0(%[addr])
\n
"
::
[
addr
]
"r"
(
access_addr_5
)
:
"a0"
);
__asm__
volatile
(
"ld a0, 0(%[addr])
\n
"
::
[
addr
]
"r"
(
access_addr_6
)
:
"a0"
);
__asm__
volatile
(
"ld a0, 0(%[addr])
\n
"
::
[
addr
]
"r"
(
access_addr_7
)
:
"a0"
);
// register uint64_t access_addr_0 = address + address_offset_0;
// register uint64_t access_addr_1 = address + address_offset_1;
// register uint64_t access_addr_2 = address + address_offset_2;
// register uint64_t access_addr_3 = address + address_offset_3;
// register uint64_t access_addr_4 = address + address_offset_4;
// register uint64_t access_addr_5 = address + address_offset_5;
// register uint64_t access_addr_6 = address + address_offset_6;
// register uint64_t access_addr_7 = address + address_offset_7;
// address += address_offset_8;
// __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_0) : "a0");
// __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_1) : "a0");
// __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_2) : "a0");
// __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_3) : "a0");
// __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_4) : "a0");
// __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_5) : "a0");
// __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_6) : "a0");
// __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_7) : "a0");
}
}
_perf_end_timer
();
...
...
@@ -364,8 +392,9 @@ void legacy_test_mem_throughput_same_set(uint64_t iter)
printf
(
"mem band width %f B/cycle (%d samples)
\n
"
,
(
float
)
iter
*
_PERF_CACHELINE_SIZE_BYTE
/
perf
.
cycle
,
iter
);
}
void
generate_linear_access_latency_matrix
()
void
generate_linear_access_latency_matrix
(
uint64_t
step
)
{
// step can be _PERF_CACHELINE_SIZE_BYTE or 8*BYTE
#define LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB 14
// LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB 14: 14 cases in total, from 1KB to 8MB
DEFINE_FLOAT_RESULT_MATRIX
(
linear_access_latency
,
size_kb_pow2
,
LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB
,
iter
,
3
);
...
...
@@ -374,32 +403,17 @@ void generate_linear_access_latency_matrix()
for
(
int
i
=
0
;
i
<
LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB
;
i
++
)
{
int
warm_up_iter
=
i
<
6
?
4
:
1
;
int
test_iter
=
i
<
6
?
4
:
2
;
linear_access_latency_result_array
[
i
][
0
]
=
test_linear_access_latency
((
1
<<
i
)
*
KB
,
_PERF_CACHELINE_SIZE_BYTE
,
warm_up_iter
,
0
);
//warmup
linear_access_latency_result_array
[
i
][
1
]
=
test_linear_access_latency
((
1
<<
i
)
*
KB
,
_PERF_CACHELINE_SIZE_BYTE
,
test_iter
,
0
);
//test
linear_access_latency_result_array
[
i
][
2
]
=
test_linear_access_latency
((
1
<<
i
)
*
KB
,
_PERF_CACHELINE_SIZE_BYTE
,
test_iter
,
0
);
//test
linear_access_latency_result_array
[
i
][
0
]
=
test_linear_access_latency
((
1
<<
i
)
*
KB
,
step
,
warm_up_iter
,
0
);
//warmup
linear_access_latency_result_array
[
i
][
1
]
=
test_linear_access_latency
((
1
<<
i
)
*
KB
,
step
,
test_iter
,
0
);
//test
linear_access_latency_result_array
[
i
][
2
]
=
test_linear_access_latency
((
1
<<
i
)
*
KB
,
step
,
test_iter
,
0
);
//test
}
printf
(
"[test step %ld]
\n
"
,
step
);
print_float_result_matrix
(
&
linear_access_latency_matrix_meta
);
}
void
generate_continuosly_access_latency_matrix
()
{
#define CONTINUOUSLY_ACCESS_MATRIX_SIZE_MAX_POW2_KB 14
// CONTINUOUSLY_ACCESS_MATRIX_SIZE_MAX_POW2_KB 14: 14 cases in total, from 1KB to 8MB
DEFINE_FLOAT_RESULT_MATRIX
(
continuosly_access_latency
,
size_kb_pow2
,
CONTINUOUSLY_ACCESS_MATRIX_SIZE_MAX_POW2_KB
,
iter
,
3
);
FOR
(
x
,
CONTINUOUSLY_ACCESS_MATRIX_SIZE_MAX_POW2_KB
)
{
continuosly_access_latency_row_array
[
x
]
=
x
;
}
FOR
(
x
,
3
)
{
continuosly_access_latency_column_array
[
x
]
=
x
;
}
for
(
int
i
=
0
;
i
<
CONTINUOUSLY_ACCESS_MATRIX_SIZE_MAX_POW2_KB
;
i
++
)
{
int
warm_up_iter
=
i
<
6
?
4
:
1
;
int
test_iter
=
i
<
6
?
4
:
2
;
continuosly_access_latency_result_array
[
i
][
0
]
=
test_linear_access_latency
((
1
<<
i
)
*
KB
,
8
*
BYTE
,
warm_up_iter
,
0
);
//warmup
continuosly_access_latency_result_array
[
i
][
1
]
=
test_linear_access_latency
((
1
<<
i
)
*
KB
,
8
*
BYTE
,
test_iter
,
0
);
//test
continuosly_access_latency_result_array
[
i
][
2
]
=
test_linear_access_latency
((
1
<<
i
)
*
KB
,
8
*
BYTE
,
test_iter
,
0
);
//test
}
print_float_result_matrix
(
&
continuosly_access_latency_matrix_meta
);
}
void
generate_pointer_tracing_latency_matrix
()
void
generate_pointer_tracing_latency_matrix
(
uint64_t
step
)
{
// step can be _PERF_CACHELINE_SIZE_BYTE or 8*BYTE
#define POINTER_CHASING_MATRIX_SIZE_MAX_POW2_KB 14
// POINTER_CHASING_MATRIX_SIZE_MAX_POW2_KB 14: 14 cases in total, from 1KB to 8MB
DEFINE_FLOAT_RESULT_MATRIX
(
pointer_tracing_latency
,
size_kb_pow2
,
POINTER_CHASING_MATRIX_SIZE_MAX_POW2_KB
,
iter
,
3
);
...
...
@@ -408,10 +422,11 @@ void generate_pointer_tracing_latency_matrix()
for
(
int
i
=
0
;
i
<
POINTER_CHASING_MATRIX_SIZE_MAX_POW2_KB
;
i
++
)
{
int
warm_up_iter
=
i
<
6
?
4
:
1
;
int
test_iter
=
i
<
6
?
4
:
2
;
pointer_tracing_latency_result_array
[
i
][
0
]
=
test_pointer_tracing_latency
((
1
<<
i
)
*
KB
,
_PERF_CACHELINE_SIZE_BYTE
,
warm_up_iter
,
0
);
//warmup
pointer_tracing_latency_result_array
[
i
][
1
]
=
test_pointer_tracing_latency
((
1
<<
i
)
*
KB
,
_PERF_CACHELINE_SIZE_BYTE
,
test_iter
,
0
);
//test
pointer_tracing_latency_result_array
[
i
][
2
]
=
test_pointer_tracing_latency
((
1
<<
i
)
*
KB
,
_PERF_CACHELINE_SIZE_BYTE
,
test_iter
,
0
);
//test
pointer_tracing_latency_result_array
[
i
][
0
]
=
test_pointer_tracing_latency
((
1
<<
i
)
*
KB
,
step
,
warm_up_iter
,
0
);
//warmup
pointer_tracing_latency_result_array
[
i
][
1
]
=
test_pointer_tracing_latency
((
1
<<
i
)
*
KB
,
step
,
test_iter
,
0
);
//test
pointer_tracing_latency_result_array
[
i
][
2
]
=
test_pointer_tracing_latency
((
1
<<
i
)
*
KB
,
step
,
test_iter
,
0
);
//test
}
printf
(
"[test step %ld]
\n
"
,
step
);
print_float_result_matrix
(
&
pointer_tracing_latency_matrix_meta
);
}
...
...
apps/maprobe/main.c
浏览文件 @
d19b76e3
...
...
@@ -80,6 +80,11 @@ void typical_random_load_test_set()
void
typical_pointer_tracing_load_test_set
()
{
printf
(
"------------- pointer tracing load test set -------------
\n
"
);
printf
(
"dobule word by dobule word tracing:
\n
"
);
test_pointer_tracing_latency
(
_PERF_PAGE_SIZE_BYTE
,
8
*
BYTE
,
10
,
0
);
test_pointer_tracing_latency
(
_PERF_L1_SIZE_BYTE
/
2
,
8
*
BYTE
,
2
,
0
);
test_pointer_tracing_latency
(
_PERF_L1_SIZE_BYTE
,
8
*
BYTE
,
2
,
0
);
test_pointer_tracing_latency
(
_PERF_L2_SIZE_BYTE
/
2
,
8
*
BYTE
,
2
,
0
);
printf
(
"cacheline by cacheline tracing:
\n
"
);
test_pointer_tracing_latency
(
_PERF_PAGE_SIZE_BYTE
,
_PERF_CACHELINE_SIZE_BYTE
,
10
,
0
);
test_pointer_tracing_latency
(
_PERF_L1_SIZE_BYTE
/
2
,
_PERF_CACHELINE_SIZE_BYTE
,
2
,
0
);
...
...
@@ -231,9 +236,10 @@ int main()
{
latency_test_example
();
generate_linear_access_latency_matrix
();
generate_continuosly_access_latency_matrix
();
generate_pointer_tracing_latency_matrix
();
generate_linear_access_latency_matrix
(
8
*
BYTE
);
generate_linear_access_latency_matrix
(
_PERF_CACHELINE_SIZE_BYTE
);
generate_pointer_tracing_latency_matrix
(
8
*
BYTE
);
generate_pointer_tracing_latency_matrix
(
_PERF_CACHELINE_SIZE_BYTE
);
generate_random_access_latency_matrix
();
generate_replacement_test_matrix
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录