diff --git a/apps/maprobe/Makefile b/apps/maprobe/Makefile index 0328ed17678a660d2dc273e99d246045e47b0dcf..663c48ebef40f185ec9e528b55d95c76c9039367 100644 --- a/apps/maprobe/Makefile +++ b/apps/maprobe/Makefile @@ -1,3 +1,3 @@ NAME = maprobe -SRCS = common.c bitutils.c resultmat.c latency-test.c bandwidth-test.c main.c +SRCS = common.c bitutils.c resultmat.c latency-test.c bandwidth-test.c replacement-test.c main.c include $(AM_HOME)/Makefile.app diff --git a/apps/maprobe/bandwidth-test.c b/apps/maprobe/bandwidth-test.c index ca52075b2572a7dfdb557105172387a596b941ef..549956d3c1027a95666301954ccf989a129b46d1 100644 --- a/apps/maprobe/bandwidth-test.c +++ b/apps/maprobe/bandwidth-test.c @@ -91,7 +91,7 @@ float test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv) printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle); } else { printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n", - size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, 8 + size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, _PERF_CACHELINE_SIZE_BYTE ); } _perf_g_total_samples += total_access; diff --git a/apps/maprobe/common.c b/apps/maprobe/common.c index 222aafc8575978220820031152ed40ff8821b384..2b5e7a961aa9d0e91ebfef3688c1c563a6544bf1 100644 --- a/apps/maprobe/common.c +++ b/apps/maprobe/common.c @@ -6,8 +6,8 @@ uint64_t _perf_g_total_samples = 0; void _perf_start_timer() { #ifndef PERF_SIM - perf.cycle = csr_read(CSR_MCYCLE); perf.instrcnt = csr_read(CSR_MINSTRET); + perf.cycle = csr_read(CSR_MCYCLE); #endif } diff --git a/apps/maprobe/include/maprobe.h b/apps/maprobe/include/maprobe.h 
index c255eae57ba6e8f3dfe46eb9f95ecbabd648ff20..0a1bdbd65344478fe7acc5e24c179ba2fda24db2 100644 --- a/apps/maprobe/include/maprobe.h +++ b/apps/maprobe/include/maprobe.h @@ -31,13 +31,15 @@ #define _PERF_MEM_SIZE_BYTE (1024 * MB) #define _PERF_L1_NUM_WAYS 4 #define _PERF_L1_NUM_SETS 256 +#define _PERF_L2_NUM_WAYS 8 #define _PERF_L2_NUM_SLICES 4 -// #define _PERF_L2_NUM_SETS 512 +#define _PERF_L2_NUM_SETS 512 #define _PERF_ADDR_STRIDE_L1_SAME_BANK _PERF_CACHELINE_SIZE_BYTE #define _PERF_ADDR_STRIDE_L1_SAME_SET (_PERF_L1_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE) #define _PERF_ADDR_STRIDE_L2_SAME_SLICE (_PERF_L2_NUM_SLICES * _PERF_CACHELINE_SIZE_BYTE) -// #define _PERF_ADDR_STRIDE_L2_SAME_SET (_PERF_L2_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE) +#define _PERF_ADDR_STRIDE_L1_SAME_SET (_PERF_L1_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE) +#define _PERF_ADDR_STRIDE_L2_SAME_SET (_PERF_L2_NUM_SLICES * _PERF_L2_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE) #define _PERF_ADDR_STRIDE_NEXT_PAGE (_PERF_PAGE_SIZE_BYTE) // probe const @@ -70,6 +72,8 @@ extern uint64_t read_pointer_tracing_linklist(uint64_t base_addr, uint64_t num_v extern void latency_test_warmup(uint64_t base_addr, uint64_t end_addr); extern float test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv); extern float test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv); +extern float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter, int to_csv); +extern float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv); extern float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv); extern float test_same_address_load_latency(int iter, int to_csv); extern float test_read_after_write_latency(int iter, int to_csv); @@ -85,6 +89,7 @@ extern float test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv); void generate_linear_access_latency_matrix(); void 
generate_pointer_tracing_latency_matrix(); void generate_random_access_latency_matrix(); +void generate_replacement_test_matrix(); // legacy test extern void legacy_test_mem_throughput(uint64_t iter); diff --git a/apps/maprobe/latency-test.c b/apps/maprobe/latency-test.c index 41f4a3569cdd21d4e512547700f0bae9bed900b2..8c0620225467f656febd2fcff6058450e83881cb 100644 --- a/apps/maprobe/latency-test.c +++ b/apps/maprobe/latency-test.c @@ -68,6 +68,7 @@ float test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv ); } _perf_g_total_samples += total_node; + _perf_blackhole(result); return acpa; } @@ -95,6 +96,7 @@ float test_same_address_load_latency(int iter, int to_csv) ); } _perf_g_total_samples += total_access; + _perf_blackhole(result); return acpa; } @@ -106,8 +108,8 @@ float test_read_after_write_latency(int iter, int to_csv) // _perf_print_timer(); _perf_start_timer(); - uint64_t address = _PERF_TEST_ADDR_BASE; for (int i = 0; i < iter; i++) { + uint64_t address = _PERF_TEST_ADDR_BASE; result += *((uint64_t*) (address)); address += sizeof(uint64_t); } @@ -123,6 +125,7 @@ float test_read_after_write_latency(int iter, int to_csv) ); } _perf_g_total_samples += total_access; + _perf_blackhole(result); return acpa; } @@ -135,10 +138,10 @@ float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter, // _perf_print_timer(); _perf_start_timer(); - uint64_t address = _PERF_TEST_ADDR_BASE; for (int i = 0; i < iter; i++) { + uint64_t address = _PERF_TEST_ADDR_BASE; for (int j = 0; j < num_access; j++) { - result += *((uint64_t*) (address)); + result += *((volatile uint64_t*) (address)); address += step; } } @@ -154,6 +157,7 @@ float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter, ); } _perf_g_total_samples += total_access; + _perf_blackhole(result); return acpa; } @@ -177,8 +181,8 @@ float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter, // _perf_print_timer(); _perf_start_timer(); - 
uint64_t address = _PERF_TEST_ADDR_BASE; for (int i = 0; i < iter; i++) { + uint64_t address = _PERF_TEST_ADDR_BASE; for (int j = 0; j < num_access; j += 8) { register uint64_t access_addr_0 = address + address_offset_0; register uint64_t access_addr_1 = address + address_offset_1; @@ -234,8 +238,8 @@ float test_linear_write_latency_batch8(uint64_t size, uint64_t step, int iter, i // _perf_print_timer(); _perf_start_timer(); - uint64_t address = _PERF_TEST_ADDR_BASE; for (int i = 0; i < iter; i++) { + uint64_t address = _PERF_TEST_ADDR_BASE; for (int j = 0; j < num_access; j += 8) { register uint64_t access_addr_0 = address + address_offset_0; register uint64_t access_addr_1 = address + address_offset_1; @@ -326,6 +330,7 @@ float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint6 ); } _perf_g_total_samples += total_access; + _perf_blackhole(result); return acpa; } @@ -395,9 +400,9 @@ void generate_pointer_tracing_latency_matrix() void generate_random_access_latency_matrix() { -#define RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB 10 +#define RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB 6 // RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB 10: from 1KB to 512KB -#define RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB 10 +#define RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB 6 // RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB 10: from 1KB to 512KB DEFINE_FLOAT_RESULT_MATRIX(random_access_latency,test_range_size_kb_pow2,RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB,access_size_kb_pow2,RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB); FOR(x,RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB) { random_access_latency_row_array[x] = x; } diff --git a/apps/maprobe/main.c b/apps/maprobe/main.c index 7149876e139fcbbb556033c1c221e09c7726535c..6e9f4197dbc751f3180d645a8f33511783d51087 100644 --- a/apps/maprobe/main.c +++ b/apps/maprobe/main.c @@ -122,6 +122,9 @@ void typical_l1_access_test_set() printf("ideal write combine buffer bandwidth:\n"); test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0); 
test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 5, 0); + printf("replacement error penalty:\n"); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); } // typical latency test for fast regression @@ -169,6 +172,7 @@ void latency_test_example() test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0); test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0); test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*4,_PERF_ADDR_STRIDE_L1_SAME_SET,8,0); test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0); test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0); test_same_address_load_latency(1024, 0); @@ -176,6 +180,32 @@ void latency_test_example() printf("total samples: %ld\n", _perf_g_total_samples); } +void l2_l3_pressure_test() +{ + for (int i = 1; i < 16; i++) { + printf("ways accessed: %d\n", i); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0); + } + for (int i = 16; i <= 512; i*=2) { + printf("ways accessed: %d\n", i); + // jump at i = 32 + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0); + } + + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*64,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*64,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + // jump at i 
= 128 + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*128,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*128,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*256,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*256,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*512,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); + test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*512,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0); +} + void legacy_latency_throughput_test() { _perf_calibrate(); @@ -194,17 +224,23 @@ void legacy_latency_throughput_test() // test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE,2, 0); // printf("MEM:\n"); // test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE*2, _PERF_CACHELINE_SIZE_BYTE,2, 0); - printf("total samples: %ld\n", _perf_g_total_samples); + printf("total samples: %ld\n", _perf_g_total_samples); } int main() { + l2_l3_pressure_test(); + return 0; + generate_replacement_test_matrix(); + + latency_test_example(); + generate_linear_access_latency_matrix(); generate_pointer_tracing_latency_matrix(); generate_random_access_latency_matrix(); + generate_replacement_test_matrix(); // matrix_print_example(); - latency_test_example(); typical_latency_test(); // pointer_tracing_graph(); // latency_test(); diff --git a/apps/maprobe/replacement-test.c b/apps/maprobe/replacement-test.c new file mode 100644 index 0000000000000000000000000000000000000000..641bce8bd3dd2e729477fc8bac3562fec12672f2 --- /dev/null +++ b/apps/maprobe/replacement-test.c @@ -0,0 +1,20 @@ +#include "maprobe.h" + +void generate_replacement_test_matrix() +{ +#define REPLACEMENT_TEST_MAX_WAY 17 // up to 17 ways +#define REPLACEMENT_TEST_ITER 5 // 1 warmup + 4 test + assert(REPLACEMENT_TEST_ITER >= 2); + 
DEFINE_FLOAT_RESULT_MATRIX(replacement_test,num_way_accessed,REPLACEMENT_TEST_MAX_WAY,iter,REPLACEMENT_TEST_ITER); + FOR(x,REPLACEMENT_TEST_ITER) { replacement_test_column_array[x] = x; } + for (int i = 0; i < REPLACEMENT_TEST_MAX_WAY; i++) { + replacement_test_row_array[i] = i+1; + int warm_up_iter = 64; + int test_iter = i < 4 ? 256 : 64; + replacement_test_result_array[i][0] = test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*(i+1),_PERF_ADDR_STRIDE_L1_SAME_SET,warm_up_iter,0); //warmup + for(int j = 1; j < REPLACEMENT_TEST_ITER; j++) { + replacement_test_result_array[i][j] = test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*(i+1),_PERF_ADDR_STRIDE_L1_SAME_SET,test_iter,0); //test + } + } + print_float_result_matrix(&replacement_test_matrix_meta); +} \ No newline at end of file