提交 fcdbdc06 编写于 作者: W William Wang

maprobe: add l2_l3_pressure_test & replacement_test

上级 d903857d
NAME = maprobe
SRCS = common.c bitutils.c resultmat.c latency-test.c bandwidth-test.c main.c
SRCS = common.c bitutils.c resultmat.c latency-test.c bandwidth-test.c replacement-test.c main.c
include $(AM_HOME)/Makefile.app
......@@ -91,7 +91,7 @@ float test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv)
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, 8
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, _PERF_CACHELINE_SIZE_BYTE
);
}
_perf_g_total_samples += total_access;
......
......@@ -6,8 +6,8 @@ uint64_t _perf_g_total_samples = 0;
// Capture the starting counter values for a measurement into the global `perf`.
// Stores the retired-instruction CSR in perf.instrcnt and the cycle CSR in
// perf.cycle. When PERF_SIM is defined the body is compiled out and `perf`
// is left untouched.
void _perf_start_timer()
{
#ifndef PERF_SIM
    perf.instrcnt = csr_read(CSR_MINSTRET);
    // Read the cycle counter exactly once, as the last step. The earlier
    // extra read was overwritten before anything could use it, so it only
    // added a redundant CSR access.
    perf.cycle = csr_read(CSR_MCYCLE);
#endif
}
......
......@@ -31,13 +31,15 @@
// Memory size constant used by the probe: 1024 * MB.
#define _PERF_MEM_SIZE_BYTE (1024 * MB)
// L1 parameters. _PERF_L1_NUM_SETS feeds the L1 same-set stride below.
#define _PERF_L1_NUM_WAYS 4
#define _PERF_L1_NUM_SETS 256
// L2 parameters. Slice and set counts feed the L2 strides below.
// NOTE(review): these are hard-coded for one cache configuration -- confirm
// they match the target before trusting stride-based results.
#define _PERF_L2_NUM_WAYS 8
#define _PERF_L2_NUM_SLICES 4
// #define _PERF_L2_NUM_SETS 512
#define _PERF_L2_NUM_SETS 512
// Address strides, all expressed in bytes via _PERF_CACHELINE_SIZE_BYTE.
// Same-bank stride: exactly one cache line.
#define _PERF_ADDR_STRIDE_L1_SAME_BANK _PERF_CACHELINE_SIZE_BYTE
// L1 same-set stride: (number of L1 sets) * (cache line size).
// Presumably the distance between addresses that index the same L1 set --
// confirm against the cache's index function.
#define _PERF_ADDR_STRIDE_L1_SAME_SET (_PERF_L1_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
// L2 same-slice stride: (number of L2 slices) * (cache line size).
#define _PERF_ADDR_STRIDE_L2_SAME_SLICE (_PERF_L2_NUM_SLICES * _PERF_CACHELINE_SIZE_BYTE)
// #define _PERF_ADDR_STRIDE_L2_SAME_SET (_PERF_L2_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
// Repeats the _PERF_ADDR_STRIDE_L1_SAME_SET definition above with an identical
// replacement list, so the redefinition is benign.
#define _PERF_ADDR_STRIDE_L1_SAME_SET (_PERF_L1_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
// L2 same-set stride: (slices) * (sets per slice) * (cache line size). Unlike
// the commented-out variant above, this one also multiplies by the slice count.
#define _PERF_ADDR_STRIDE_L2_SAME_SET (_PERF_L2_NUM_SLICES * _PERF_L2_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
// Next-page stride: one page, as given by _PERF_PAGE_SIZE_BYTE.
#define _PERF_ADDR_STRIDE_NEXT_PAGE (_PERF_PAGE_SIZE_BYTE)
// probe const
......@@ -70,6 +72,8 @@ extern uint64_t read_pointer_tracing_linklist(uint64_t base_addr, uint64_t num_v
extern void latency_test_warmup(uint64_t base_addr, uint64_t end_addr);
extern float test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv);
extern float test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv);
extern float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter, int to_csv);
extern float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv);
extern float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv);
extern float test_same_address_load_latency(int iter, int to_csv);
extern float test_read_after_write_latency(int iter, int to_csv);
......@@ -85,6 +89,7 @@ extern float test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv);
void generate_linear_access_latency_matrix();
void generate_pointer_tracing_latency_matrix();
void generate_random_access_latency_matrix();
void generate_replacement_test_matrix();
// legacy test
extern void legacy_test_mem_throughput(uint64_t iter);
......
......@@ -68,6 +68,7 @@ float test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv
);
}
_perf_g_total_samples += total_node;
_perf_blackhole(result);
return acpa;
}
......@@ -95,6 +96,7 @@ float test_same_address_load_latency(int iter, int to_csv)
);
}
_perf_g_total_samples += total_access;
_perf_blackhole(result);
return acpa;
}
......@@ -106,8 +108,8 @@ float test_read_after_write_latency(int iter, int to_csv)
// _perf_print_timer();
_perf_start_timer();
uint64_t address = _PERF_TEST_ADDR_BASE;
for (int i = 0; i < iter; i++) {
uint64_t address = _PERF_TEST_ADDR_BASE;
result += *((uint64_t*) (address));
address += sizeof(uint64_t);
}
......@@ -123,6 +125,7 @@ float test_read_after_write_latency(int iter, int to_csv)
);
}
_perf_g_total_samples += total_access;
_perf_blackhole(result);
return acpa;
}
......@@ -135,10 +138,10 @@ float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter,
// _perf_print_timer();
_perf_start_timer();
uint64_t address = _PERF_TEST_ADDR_BASE;
for (int i = 0; i < iter; i++) {
uint64_t address = _PERF_TEST_ADDR_BASE;
for (int j = 0; j < num_access; j++) {
result += *((uint64_t*) (address));
result += *((volatile uint64_t*) (address));
address += step;
}
}
......@@ -154,6 +157,7 @@ float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter,
);
}
_perf_g_total_samples += total_access;
_perf_blackhole(result);
return acpa;
}
......@@ -177,8 +181,8 @@ float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter,
// _perf_print_timer();
_perf_start_timer();
uint64_t address = _PERF_TEST_ADDR_BASE;
for (int i = 0; i < iter; i++) {
uint64_t address = _PERF_TEST_ADDR_BASE;
for (int j = 0; j < num_access; j += 8) {
register uint64_t access_addr_0 = address + address_offset_0;
register uint64_t access_addr_1 = address + address_offset_1;
......@@ -234,8 +238,8 @@ float test_linear_write_latency_batch8(uint64_t size, uint64_t step, int iter, i
// _perf_print_timer();
_perf_start_timer();
uint64_t address = _PERF_TEST_ADDR_BASE;
for (int i = 0; i < iter; i++) {
uint64_t address = _PERF_TEST_ADDR_BASE;
for (int j = 0; j < num_access; j += 8) {
register uint64_t access_addr_0 = address + address_offset_0;
register uint64_t access_addr_1 = address + address_offset_1;
......@@ -326,6 +330,7 @@ float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint6
);
}
_perf_g_total_samples += total_access;
_perf_blackhole(result);
return acpa;
}
......@@ -395,9 +400,9 @@ void generate_pointer_tracing_latency_matrix()
void generate_random_access_latency_matrix()
{
#define RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB 10
#define RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB 6
// RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB 10: from 1KB to 512KB
#define RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB 10
#define RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB 6
// RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB 10: from 1KB to 512KB
DEFINE_FLOAT_RESULT_MATRIX(random_access_latency,test_range_size_kb_pow2,RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB,access_size_kb_pow2,RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB);
FOR(x,RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB) { random_access_latency_row_array[x] = x; }
......
......@@ -122,6 +122,9 @@ void typical_l1_access_test_set()
printf("ideal write combine buffer bandwidth:\n");
test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 5, 0);
printf("replacement error penalty:\n");
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
}
// typical latency test for fast regression
......@@ -169,6 +172,7 @@ void latency_test_example()
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0);
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0);
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0);
test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*4,_PERF_ADDR_STRIDE_L1_SAME_SET,8,0);
test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0);
test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0);
test_same_address_load_latency(1024, 0);
......@@ -176,6 +180,32 @@ void latency_test_example()
printf("total samples: %ld\n", _perf_g_total_samples);
}
// Sweep test_linear_access_latency_simple over growing footprints, stepping by
// a same-set stride so each extra step adds one more line at that stride.
//
// Phase 1: 1..15 strides of _PERF_ADDR_STRIDE_L2_SAME_SET, each run twice.
// Phase 2: 16, 32, ..., 512 strides of _PERF_ADDR_STRIDE_L2_SAME_SET, run once.
// Phase 3: 32, 64, ..., 512 strides of _PERF_ADDR_STRIDE_L1_SAME_SET, each run
//          twice.
// Every run uses 64 iterations and human-readable (non-CSV) output.
//
// Original author's notes: a latency jump was observed at 32 ways in phase 2
// and at 128 ways in phase 3.
void l2_l3_pressure_test()
{
    const int runs_per_point = 2;
    int ways;
    int run;

    // Phase 1: fine-grained sweep with the L2 same-set stride.
    ways = 1;
    while (ways < 16) {
        printf("ways accessed: %d\n", ways);
        for (run = 0; run < runs_per_point; run++) {
            test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*ways,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0);
        }
        ways++;
    }

    // Phase 2: power-of-two sweep with the L2 same-set stride, one run each.
    ways = 16;
    while (ways <= 512) {
        printf("ways accessed: %d\n", ways);
        test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*ways,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0);
        ways *= 2;
    }

    // Phase 3: power-of-two sweep with the L1 same-set stride, no header line.
    for (ways = 32; ways <= 512; ways *= 2) {
        for (run = 0; run < runs_per_point; run++) {
            test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*ways,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
        }
    }
}
void legacy_latency_throughput_test()
{
_perf_calibrate();
......@@ -194,17 +224,23 @@ void legacy_latency_throughput_test()
// test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE,2, 0);
// printf("MEM:\n");
// test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE*2, _PERF_CACHELINE_SIZE_BYTE,2, 0);
printf("total samples: %ld\n", _perf_g_total_samples);
printf("total sampl8es: %ld\n", _perf_g_total_samples);
}
int main()
{
l2_l3_pressure_test();
return 0;
generate_replacement_test_matrix();
latency_test_example();
generate_linear_access_latency_matrix();
generate_pointer_tracing_latency_matrix();
generate_random_access_latency_matrix();
generate_replacement_test_matrix();
// matrix_print_example();
latency_test_example();
typical_latency_test();
// pointer_tracing_graph();
// latency_test();
......
#include "maprobe.h"
// Build and print a result matrix of linear-access latencies for a growing
// number of lines spaced _PERF_ADDR_STRIDE_L1_SAME_SET apart.
//
// Rows:    number of strides accessed, 1 .. REPLACEMENT_TEST_MAX_WAY.
// Columns: run index; column 0 is the warmup run, the rest are measured runs.
// Each cell holds the value returned by test_linear_access_latency_simple.
void generate_replacement_test_matrix()
{
#define REPLACEMENT_TEST_MAX_WAY 17 // number of rows: 1 to 17 ways accessed
#define REPLACEMENT_TEST_ITER 5 // number of columns: 1 warmup + 4 test
    // At least one warmup column and one measured column are required.
    assert(REPLACEMENT_TEST_ITER >= 2);
    DEFINE_FLOAT_RESULT_MATRIX(replacement_test,num_way_accessed,REPLACEMENT_TEST_MAX_WAY,iter,REPLACEMENT_TEST_ITER);
    // Column labels are simply the run indices.
    FOR(x,REPLACEMENT_TEST_ITER) { replacement_test_column_array[x] = x; }

    for (int row = 0; row < REPLACEMENT_TEST_MAX_WAY; row++) {
        const int ways = row + 1;
        const int warmup_rounds = 64;
        int measured_rounds;

        // The first four rows use more iterations per measured run.
        if (row < 4) {
            measured_rounds = 256;
        } else {
            measured_rounds = 64;
        }

        replacement_test_row_array[row] = ways;

        // Column 0: warmup run.
        replacement_test_result_array[row][0] = test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*ways,_PERF_ADDR_STRIDE_L1_SAME_SET,warmup_rounds,0);

        // Columns 1 .. REPLACEMENT_TEST_ITER-1: measured runs.
        int col = 1;
        while (col < REPLACEMENT_TEST_ITER) {
            replacement_test_result_array[row][col] = test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*ways,_PERF_ADDR_STRIDE_L1_SAME_SET,measured_rounds,0);
            col++;
        }
    }

    print_float_result_matrix(&replacement_test_matrix_meta);
}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册