diff --git a/apps/maprobe/include/maprobe.h b/apps/maprobe/include/maprobe.h
index 0991be7ef56c617dcfba16704f46cf066bb5c05d..c255eae57ba6e8f3dfe46eb9f95ecbabd648ff20 100644
--- a/apps/maprobe/include/maprobe.h
+++ b/apps/maprobe/include/maprobe.h
@@ -73,12 +73,20 @@ extern float test_linear_access_latency(uint64_t size, uint64_t step, int iter,
 extern float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv);
 extern float test_same_address_load_latency(int iter, int to_csv);
 extern float test_read_after_write_latency(int iter, int to_csv);
+extern float test_linear_write_latency(uint64_t size, uint64_t step, int iter, int to_csv);
+
 // bandwidth test
 extern float test_l1_load_bandwidth(uint64_t size, int iter, int to_csv);
 extern float test_l1_store_bandwidth(uint64_t size, int iter, int to_csv);
 extern float test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv);
 
 
+// key parameter matrix generation (declared with (void) so they are real prototypes)
+extern void generate_linear_access_latency_matrix(void);
+extern void generate_pointer_tracing_latency_matrix(void);
+extern void generate_random_access_latency_matrix(void);
+
+// legacy tests
 extern void legacy_test_mem_throughput(uint64_t iter);
 extern void legacy_test_mem_throughput_same_set(uint64_t iter);
 
diff --git a/apps/maprobe/latency-test.c b/apps/maprobe/latency-test.c
index f9df8da33b52597d21adad882e167d52eeb874e0..41f4a3569cdd21d4e512547700f0bae9bed900b2 100644
--- a/apps/maprobe/latency-test.c
+++ b/apps/maprobe/latency-test.c
@@ -67,8 +67,6 @@ float test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv
             size/KB, iter, acpa, total_node * 8 * BYTE / (float)perf.cycle, total_node, perf.cycle
         );
     }
-
-    _perf_blackhole(result);
     _perf_g_total_samples += total_node;
     return acpa;
 }
@@ -96,8 +94,6 @@ float test_same_address_load_latency(int iter, int to_csv)
             acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle
         );
     }
-
-    _perf_blackhole(result);
     _perf_g_total_samples += 
total_access;
     return acpa;
 }
@@ -126,8 +122,6 @@ float test_read_after_write_latency(int iter, int to_csv)
             acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle
         );
     }
-
-    _perf_blackhole(result);
     _perf_g_total_samples += total_access;
     return acpa;
 }
@@ -159,8 +153,6 @@ float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter,
             size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step
         );
     }
-
-    _perf_blackhole(result);
     _perf_g_total_samples += total_access;
     return acpa;
 }
@@ -169,7 +161,6 @@ float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter,
 {
     // printf("stride %d linear access latency test\n", step);
     // printf("range (B), read latency, iters, samples, cycles\n");
-    register uint64_t result = 0;
     uint64_t num_access = size / step;
     num_access += num_access % 8 ? 8 - num_access % 8 : 0;
     assert(num_access >= 8);
@@ -219,8 +210,63 @@ float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter,
             size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step
         );
     }
+    _perf_g_total_samples += total_access;
+    return acpa;
+}
+
+// Linear write latency probe: times num_access * iter 8-byte "sd" stores, one every
+// step bytes upward from _PERF_TEST_ADDR_BASE, issued in unrolled batches of 8.
+// num_access is size / step rounded up to a multiple of 8. Prints one result line
+// (CSV when to_csv is nonzero) and returns the average cycles per store.
+float test_linear_write_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv)
+{
+    assert(step > 0); // step is the divisor on the next line
+    uint64_t num_access = size / step;
+    num_access += num_access % 8 ? 8 - num_access % 8 : 0;
+    assert(num_access >= 8);
+    // prepare access offset
+    uint64_t address_offset_0 = 0;
+    register uint64_t address_offset_1 = step * 1;
+    register uint64_t address_offset_2 = step * 2;
+    register uint64_t address_offset_3 = step * 3;
+    register uint64_t address_offset_4 = step * 4;
+    register uint64_t address_offset_5 = step * 5;
+    register uint64_t address_offset_6 = step * 6;
+    register uint64_t address_offset_7 = step * 7;
+    register uint64_t address_offset_8 = step * 8;
 
-    _perf_blackhole(result);
+    _perf_start_timer();
+    uint64_t address = _PERF_TEST_ADDR_BASE; // NOTE(review): never reset per pass, so the footprint grows with iter - confirm intended
+    for (int i = 0; i < iter; i++) {
+        for (uint64_t j = 0; j < num_access; j += 8) { // same unsigned type as num_access: no signed/unsigned compare
+            register uint64_t access_addr_0 = address + address_offset_0;
+            register uint64_t access_addr_1 = address + address_offset_1;
+            register uint64_t access_addr_2 = address + address_offset_2;
+            register uint64_t access_addr_3 = address + address_offset_3;
+            register uint64_t access_addr_4 = address + address_offset_4;
+            register uint64_t access_addr_5 = address + address_offset_5;
+            register uint64_t access_addr_6 = address + address_offset_6;
+            register uint64_t access_addr_7 = address + address_offset_7;
+            address += address_offset_8;
+            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_0) : "a0", "memory"); // stores whatever a0 holds; "memory" declares the write
+            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_1) : "a0", "memory");
+            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_2) : "a0", "memory");
+            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_3) : "a0", "memory");
+            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_4) : "a0", "memory");
+            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_5) : "a0", "memory");
+            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_6) : "a0", "memory");
+            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_7) : "a0", "memory");
+        }
+    }
+    _perf_end_timer();
+    uint64_t total_access = num_access * iter;
+    float acpa = (float)perf.cycle / total_access; // average cycle per access
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle);
+    } else {
+        printf("range %ldKB (%d iters) batch(8) linear write latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %ldB\n",
+            size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, (long)step); // step is 64-bit: printed with %ld, not %d
+    }
     _perf_g_total_samples += total_access;
     return acpa;
 }
@@ -230,6 +276,11 @@ float test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_
     return test_linear_access_latency_batch8(size, step, iter, to_csv);
 }
 
+float test_linear_write_latency(uint64_t size, uint64_t step, int iter, int to_csv)
+{
+    return test_linear_write_latency_batch8(size, step, iter, to_csv);
+}
+
 float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv)
 {
     // printf("align %d random access (cache line) latency test, %s\n",
@@ -274,8 +325,6 @@ float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint6
             pregen_addr ? 
"pregen addr" : "runtime addr" ); } - - _perf_blackhole(result); _perf_g_total_samples += total_access; return acpa; } @@ -309,3 +358,58 @@ void legacy_test_mem_throughput_same_set(uint64_t iter) *(uint64_t*) _PERF_BLACKHOLE = result; printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter); } + +void generate_linear_access_latency_matrix() +{ +#define LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB 14 + // LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB 14: 14 cases in total, from 1KB to 8MB + DEFINE_FLOAT_RESULT_MATRIX(linear_access_latency,size_kb_pow2,LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB,iter,3); + FOR(x,LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB) { linear_access_latency_row_array[x] = x; } + FOR(x,3) { linear_access_latency_column_array[x] = x; } + for (int i = 0; i < LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB; i++) { + int warm_up_iter = i < 6 ? 4 : 1; + int test_iter = i < 6 ? 4 : 2; + linear_access_latency_result_array[i][0] = test_linear_access_latency((1<